1/* Extended regular expression matching and search library.
2   Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, see
18   <http://www.gnu.org/licenses/>.  */
19
20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21					  size_t length, reg_syntax_t syntax);
22static void re_compile_fastmap_iter (regex_t *bufp,
23				     const re_dfastate_t *init_state,
24				     char *fastmap);
25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26#ifdef RE_ENABLE_I18N
27static void free_charset (re_charset_t *cset);
28#endif /* RE_ENABLE_I18N */
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36			       reg_errcode_t (fn (void *, bin_tree_t *)),
37			       void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39				reg_errcode_t (fn (void *, bin_tree_t *)),
40				void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44				 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50				   unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53					 int node, int root);
54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55static int fetch_number (re_string_t *input, re_token_t *token,
56			 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
58			reg_syntax_t syntax) internal_function;
59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60			  reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62				  re_token_t *token, reg_syntax_t syntax,
63				  int nest, reg_errcode_t *err);
64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65				 re_token_t *token, reg_syntax_t syntax,
66				 int nest, reg_errcode_t *err);
67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68				     re_token_t *token, reg_syntax_t syntax,
69				     int nest, reg_errcode_t *err);
70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71				  re_token_t *token, reg_syntax_t syntax,
72				  int nest, reg_errcode_t *err);
73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74				 re_dfa_t *dfa, re_token_t *token,
75				 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77				      re_token_t *token, reg_syntax_t syntax,
78				      reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80					    re_string_t *regexp,
81					    re_token_t *token, int token_len,
82					    re_dfa_t *dfa,
83					    reg_syntax_t syntax,
84					    int accept_hyphen);
85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86					  re_string_t *regexp,
87					  re_token_t *token);
88#ifdef RE_ENABLE_I18N
89static reg_errcode_t build_equiv_class (bitset_t sbcset,
90					re_charset_t *mbcset,
91					int *equiv_class_alloc,
92					const unsigned char *name);
93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94				      bitset_t sbcset,
95				      re_charset_t *mbcset,
96				      int *char_class_alloc,
97				      const unsigned char *class_name,
98				      reg_syntax_t syntax);
99#else  /* not RE_ENABLE_I18N */
100static reg_errcode_t build_equiv_class (bitset_t sbcset,
101					const unsigned char *name);
102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103				      bitset_t sbcset,
104				      const unsigned char *class_name,
105				      reg_syntax_t syntax);
106#endif /* not RE_ENABLE_I18N */
107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108				       RE_TRANSLATE_TYPE trans,
109				       const unsigned char *class_name,
110				       const unsigned char *extra,
111				       int non_match, reg_errcode_t *err);
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113				bin_tree_t *left, bin_tree_t *right,
114				re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116				      bin_tree_t *left, bin_tree_t *right,
117				      const re_token_t *token);
118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122
123/* This table gives an error message for each of the error codes listed
124   in regex.h.  Obviously the order here has to be same as there.
125   POSIX doesn't require that we do anything for REG_NOERROR,
126   but why not be nice?  */
127
128const char __re_error_msgid[] attribute_hidden =
129  {
130#define REG_NOERROR_IDX	0
131    gettext_noop ("Success")	/* REG_NOERROR */
132    "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134    gettext_noop ("No match")	/* REG_NOMATCH */
135    "\0"
136#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138    "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141    "\0"
142#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144    "\0"
145#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147    "\0"
148#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150    "\0"
151#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153    "\0"
154#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156    "\0"
157#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159    "\0"
160#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162    "\0"
163#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164    gettext_noop ("Invalid range end")	/* REG_ERANGE */
165    "\0"
166#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167    gettext_noop ("Memory exhausted") /* REG_ESPACE */
168    "\0"
169#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171    "\0"
172#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173    gettext_noop ("Premature end of regular expression") /* REG_EEND */
174    "\0"
175#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176    gettext_noop ("Regular expression too big") /* REG_ESIZE */
177    "\0"
178#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180  };
181
182const size_t __re_error_msgid_idx[] attribute_hidden =
183  {
184    REG_NOERROR_IDX,
185    REG_NOMATCH_IDX,
186    REG_BADPAT_IDX,
187    REG_ECOLLATE_IDX,
188    REG_ECTYPE_IDX,
189    REG_EESCAPE_IDX,
190    REG_ESUBREG_IDX,
191    REG_EBRACK_IDX,
192    REG_EPAREN_IDX,
193    REG_EBRACE_IDX,
194    REG_BADBR_IDX,
195    REG_ERANGE_IDX,
196    REG_ESPACE_IDX,
197    REG_BADRPT_IDX,
198    REG_EEND_IDX,
199    REG_ESIZE_IDX,
200    REG_ERPAREN_IDX
201  };
202
203/* Entry points for GNU code.  */
204
205/* re_compile_pattern is the GNU regular expression compiler: it
206   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207   Returns 0 if the pattern was valid, otherwise an error string.
208
209   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210   are set in BUFP on entry.  */
211
212const char *
213re_compile_pattern (pattern, length, bufp)
214    const char *pattern;
215    size_t length;
216    struct re_pattern_buffer *bufp;
217{
218  reg_errcode_t ret;
219
220  /* And GNU code determines whether or not to get register information
221     by passing null for the REGS argument to re_match, etc., not by
222     setting no_sub, unless RE_NO_SUB is set.  */
223  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
224
225  /* Match anchors at newline.  */
226  bufp->newline_anchor = 1;
227
228  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
229
230  if (!ret)
231    return NULL;
232  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233}
234#ifdef _LIBC
235weak_alias (__re_compile_pattern, re_compile_pattern)
236#endif
237
238/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
239   also be assigned to arbitrarily: each pattern buffer stores its own
240   syntax, so it can be changed between regex compilations.  */
241/* This has no initializer because initialized variables in Emacs
242   become read-only after dumping.  */
243reg_syntax_t re_syntax_options;
244
245
246/* Specify the precise syntax of regexps for compilation.  This provides
247   for compatibility for various utilities which historically have
248   different, incompatible syntaxes.
249
250   The argument SYNTAX is a bit mask comprised of the various bits
251   defined in regex.h.  We return the old syntax.  */
252
253reg_syntax_t
254re_set_syntax (syntax)
255    reg_syntax_t syntax;
256{
257  reg_syntax_t ret = re_syntax_options;
258
259  re_syntax_options = syntax;
260  return ret;
261}
262#ifdef _LIBC
263weak_alias (__re_set_syntax, re_set_syntax)
264#endif
265
266int
267re_compile_fastmap (bufp)
268    struct re_pattern_buffer *bufp;
269{
270  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
271  char *fastmap = bufp->fastmap;
272
273  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275  if (dfa->init_state != dfa->init_state_word)
276    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277  if (dfa->init_state != dfa->init_state_nl)
278    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279  if (dfa->init_state != dfa->init_state_begbuf)
280    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281  bufp->fastmap_accurate = 1;
282  return 0;
283}
284#ifdef _LIBC
285weak_alias (__re_compile_fastmap, re_compile_fastmap)
286#endif
287
288static inline void
289__attribute ((always_inline))
290re_set_fastmap (char *fastmap, int icase, int ch)
291{
292  fastmap[ch] = 1;
293  if (icase)
294    fastmap[tolower (ch)] = 1;
295}
296
297/* Helper function for re_compile_fastmap.
298   Compile fastmap for the initial_state INIT_STATE.  */
299
300static void
301re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302			 char *fastmap)
303{
304  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
305  int node_cnt;
306  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308    {
309      int node = init_state->nodes.elems[node_cnt];
310      re_token_type_t type = dfa->nodes[node].type;
311
312      if (type == CHARACTER)
313	{
314	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315#ifdef RE_ENABLE_I18N
316	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
317	    {
318	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
319	      wchar_t wc;
320	      mbstate_t state;
321
322	      p = buf;
323	      *p++ = dfa->nodes[node].opr.c;
324	      while (++node < dfa->nodes_len
325		     &&	dfa->nodes[node].type == CHARACTER
326		     && dfa->nodes[node].mb_partial)
327		*p++ = dfa->nodes[node].opr.c;
328	      memset (&state, '\0', sizeof (state));
329	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
330			     &state) == p - buf
331		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
332		      != (size_t) -1))
333		re_set_fastmap (fastmap, 0, buf[0]);
334	    }
335#endif
336	}
337      else if (type == SIMPLE_BRACKET)
338	{
339	  int i, ch;
340	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
341	    {
342	      int j;
343	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
344	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
345		if (w & ((bitset_word_t) 1 << j))
346		  re_set_fastmap (fastmap, icase, ch);
347	    }
348	}
349#ifdef RE_ENABLE_I18N
350      else if (type == COMPLEX_BRACKET)
351	{
352	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
353	  int i;
354
355# ifdef _LIBC
356	  /* See if we have to try all bytes which start multiple collation
357	     elements.
358	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
359		  collation element, and don't catch 'b' since 'b' is
360		  the only collation element which starts from 'b' (and
361		  it is caught by SIMPLE_BRACKET).  */
362	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
363		  && (cset->ncoll_syms || cset->nranges))
364		{
365		  const int32_t *table = (const int32_t *)
366		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
367		  for (i = 0; i < SBC_MAX; ++i)
368		    if (table[i] < 0)
369		      re_set_fastmap (fastmap, icase, i);
370		}
371# endif /* _LIBC */
372
373	  /* See if we have to start the match at all multibyte characters,
374	     i.e. where we would not find an invalid sequence.  This only
375	     applies to multibyte character sets; for single byte character
376	     sets, the SIMPLE_BRACKET again suffices.  */
377	  if (dfa->mb_cur_max > 1
378	      && (cset->nchar_classes || cset->non_match || cset->nranges
379# ifdef _LIBC
380		  || cset->nequiv_classes
381# endif /* _LIBC */
382		 ))
383	    {
384	      unsigned char c = 0;
385	      do
386		{
387		  mbstate_t mbs;
388		  memset (&mbs, 0, sizeof (mbs));
389		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
390		    re_set_fastmap (fastmap, false, (int) c);
391		}
392	      while (++c != 0);
393	    }
394
395	  else
396	    {
397	      /* ... Else catch all bytes which can start the mbchars.  */
398	      for (i = 0; i < cset->nmbchars; ++i)
399		{
400		  char buf[256];
401		  mbstate_t state;
402		  memset (&state, '\0', sizeof (state));
403		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
404		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
405		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
406		    {
407		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
408			  != (size_t) -1)
409			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
410		    }
411		}
412	    }
413	}
414#endif /* RE_ENABLE_I18N */
415      else if (type == OP_PERIOD
416#ifdef RE_ENABLE_I18N
417	       || type == OP_UTF8_PERIOD
418#endif /* RE_ENABLE_I18N */
419	       || type == END_OF_RE)
420	{
421	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
422	  if (type == END_OF_RE)
423	    bufp->can_be_null = 1;
424	  return;
425	}
426    }
427}
428
429/* Entry point for POSIX code.  */
430/* regcomp takes a regular expression as a string and compiles it.
431
432   PREG is a regex_t *.  We do not expect any fields to be initialized,
433   since POSIX says we shouldn't.  Thus, we set
434
435     `buffer' to the compiled pattern;
436     `used' to the length of the compiled pattern;
437     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438       REG_EXTENDED bit in CFLAGS is set; otherwise, to
439       RE_SYNTAX_POSIX_BASIC;
440     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441     `fastmap' to an allocated space for the fastmap;
442     `fastmap_accurate' to zero;
443     `re_nsub' to the number of subexpressions in PATTERN.
444
445   PATTERN is the address of the pattern string.
446
447   CFLAGS is a series of bits which affect compilation.
448
449     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450     use POSIX basic syntax.
451
452     If REG_NEWLINE is set, then . and [^...] don't match newline.
453     Also, regexec will try a match beginning after every newline.
454
455     If REG_ICASE is set, then we considers upper- and lowercase
456     versions of letters to be equivalent when matching.
457
458     If REG_NOSUB is set, then when PREG is passed to regexec, that
459     routine will report only success or failure, and nothing about the
460     registers.
461
462   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
463   the return codes and their meanings.)  */
464
465int
466regcomp (preg, pattern, cflags)
467    regex_t *__restrict preg;
468    const char *__restrict pattern;
469    int cflags;
470{
471  reg_errcode_t ret;
472  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
473			 : RE_SYNTAX_POSIX_BASIC);
474
475  preg->buffer = NULL;
476  preg->allocated = 0;
477  preg->used = 0;
478
479  /* Try to allocate space for the fastmap.  */
480  preg->fastmap = re_malloc (char, SBC_MAX);
481  if (BE (preg->fastmap == NULL, 0))
482    return REG_ESPACE;
483
484  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
485
486  /* If REG_NEWLINE is set, newlines are treated differently.  */
487  if (cflags & REG_NEWLINE)
488    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
489      syntax &= ~RE_DOT_NEWLINE;
490      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
491      /* It also changes the matching behavior.  */
492      preg->newline_anchor = 1;
493    }
494  else
495    preg->newline_anchor = 0;
496  preg->no_sub = !!(cflags & REG_NOSUB);
497  preg->translate = NULL;
498
499  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
500
501  /* POSIX doesn't distinguish between an unmatched open-group and an
502     unmatched close-group: both are REG_EPAREN.  */
503  if (ret == REG_ERPAREN)
504    ret = REG_EPAREN;
505
506  /* We have already checked preg->fastmap != NULL.  */
507  if (BE (ret == REG_NOERROR, 1))
508    /* Compute the fastmap now, since regexec cannot modify the pattern
509       buffer.  This function never fails in this implementation.  */
510    (void) re_compile_fastmap (preg);
511  else
512    {
513      /* Some error occurred while compiling the expression.  */
514      re_free (preg->fastmap);
515      preg->fastmap = NULL;
516    }
517
518  return (int) ret;
519}
520#ifdef _LIBC
521weak_alias (__regcomp, regcomp)
522#endif
523
524/* Returns a message corresponding to an error code, ERRCODE, returned
525   from either regcomp or regexec.   We don't use PREG here.  */
526
527size_t
528regerror (errcode, preg, errbuf, errbuf_size)
529    int errcode;
530    const regex_t *__restrict preg;
531    char *__restrict errbuf;
532    size_t errbuf_size;
533{
534  const char *msg;
535  size_t msg_size;
536
537  if (BE (errcode < 0
538	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
539			       / sizeof (__re_error_msgid_idx[0])), 0))
540    /* Only error codes returned by the rest of the code should be passed
541       to this routine.  If we are given anything else, or if other regex
542       code generates an invalid error code, then the program has a bug.
543       Dump core so we can fix it.  */
544    abort ();
545
546  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
547
548  msg_size = strlen (msg) + 1; /* Includes the null.  */
549
550  if (BE (errbuf_size != 0, 1))
551    {
552      if (BE (msg_size > errbuf_size, 0))
553	{
554#if defined HAVE_MEMPCPY || defined _LIBC
555	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
556#else
557	  memcpy (errbuf, msg, errbuf_size - 1);
558	  errbuf[errbuf_size - 1] = 0;
559#endif
560	}
561      else
562	memcpy (errbuf, msg, msg_size);
563    }
564
565  return msg_size;
566}
567#ifdef _LIBC
568weak_alias (__regerror, regerror)
569#endif
570
571
572#ifdef RE_ENABLE_I18N
573/* This static array is used for the map to single-byte characters when
574   UTF-8 is used.  Otherwise we would allocate memory just to initialize
575   it the same all the time.  UTF-8 is the preferred encoding so this is
576   a worthwhile optimization.  */
577static const bitset_t utf8_sb_map =
578{
579  /* Set the first 128 bits.  */
580  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
581};
582#endif
583
584
585static void
586free_dfa_content (re_dfa_t *dfa)
587{
588  int i, j;
589
590  if (dfa->nodes)
591    for (i = 0; i < dfa->nodes_len; ++i)
592      free_token (dfa->nodes + i);
593  re_free (dfa->nexts);
594  for (i = 0; i < dfa->nodes_len; ++i)
595    {
596      if (dfa->eclosures != NULL)
597	re_node_set_free (dfa->eclosures + i);
598      if (dfa->inveclosures != NULL)
599	re_node_set_free (dfa->inveclosures + i);
600      if (dfa->edests != NULL)
601	re_node_set_free (dfa->edests + i);
602    }
603  re_free (dfa->edests);
604  re_free (dfa->eclosures);
605  re_free (dfa->inveclosures);
606  re_free (dfa->nodes);
607
608  if (dfa->state_table)
609    for (i = 0; i <= dfa->state_hash_mask; ++i)
610      {
611	struct re_state_table_entry *entry = dfa->state_table + i;
612	for (j = 0; j < entry->num; ++j)
613	  {
614	    re_dfastate_t *state = entry->array[j];
615	    free_state (state);
616	  }
617	re_free (entry->array);
618      }
619  re_free (dfa->state_table);
620#ifdef RE_ENABLE_I18N
621  if (dfa->sb_char != utf8_sb_map)
622    re_free (dfa->sb_char);
623#endif
624  re_free (dfa->subexp_map);
625#ifdef DEBUG
626  re_free (dfa->re_str);
627#endif
628
629  re_free (dfa);
630}
631
632
633/* Free dynamically allocated space used by PREG.  */
634
635void
636regfree (preg)
637    regex_t *preg;
638{
639  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
640  if (BE (dfa != NULL, 1))
641    free_dfa_content (dfa);
642  preg->buffer = NULL;
643  preg->allocated = 0;
644
645  re_free (preg->fastmap);
646  preg->fastmap = NULL;
647
648  re_free (preg->translate);
649  preg->translate = NULL;
650}
651#ifdef _LIBC
652weak_alias (__regfree, regfree)
653#endif
654
655/* Entry points compatible with 4.2 BSD regex library.  We don't define
656   them unless specifically requested.  */
657
658#if defined _REGEX_RE_COMP || defined _LIBC
659
660/* BSD has one and only one pattern buffer.  */
661static struct re_pattern_buffer re_comp_buf;
662
663char *
664# ifdef _LIBC
665/* Make these definitions weak in libc, so POSIX programs can redefine
666   these names if they don't use our functions, and still use
667   regcomp/regexec above without link errors.  */
668weak_function
669# endif
670re_comp (s)
671     const char *s;
672{
673  reg_errcode_t ret;
674  char *fastmap;
675
676  if (!s)
677    {
678      if (!re_comp_buf.buffer)
679	return gettext ("No previous regular expression");
680      return 0;
681    }
682
683  if (re_comp_buf.buffer)
684    {
685      fastmap = re_comp_buf.fastmap;
686      re_comp_buf.fastmap = NULL;
687      __regfree (&re_comp_buf);
688      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
689      re_comp_buf.fastmap = fastmap;
690    }
691
692  if (re_comp_buf.fastmap == NULL)
693    {
694      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
695      if (re_comp_buf.fastmap == NULL)
696	return (char *) gettext (__re_error_msgid
697				 + __re_error_msgid_idx[(int) REG_ESPACE]);
698    }
699
700  /* Since `re_exec' always passes NULL for the `regs' argument, we
701     don't need to initialize the pattern buffer fields which affect it.  */
702
703  /* Match anchors at newlines.  */
704  re_comp_buf.newline_anchor = 1;
705
706  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
707
708  if (!ret)
709    return NULL;
710
711  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
712  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
713}
714
715#ifdef _LIBC
716libc_freeres_fn (free_mem)
717{
718  __regfree (&re_comp_buf);
719}
720#endif
721
722#endif /* _REGEX_RE_COMP */
723
724/* Internal entry point.
725   Compile the regular expression PATTERN, whose length is LENGTH.
726   SYNTAX indicate regular expression's syntax.  */
727
728static reg_errcode_t
729re_compile_internal (regex_t *preg, const char * pattern, size_t length,
730		     reg_syntax_t syntax)
731{
732  reg_errcode_t err = REG_NOERROR;
733  re_dfa_t *dfa;
734  re_string_t regexp;
735
736  /* Initialize the pattern buffer.  */
737  preg->fastmap_accurate = 0;
738  preg->syntax = syntax;
739  preg->not_bol = preg->not_eol = 0;
740  preg->used = 0;
741  preg->re_nsub = 0;
742  preg->can_be_null = 0;
743  preg->regs_allocated = REGS_UNALLOCATED;
744
745  /* Initialize the dfa.  */
746  dfa = (re_dfa_t *) preg->buffer;
747  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
748    {
749      /* If zero allocated, but buffer is non-null, try to realloc
750	 enough space.  This loses if buffer's address is bogus, but
751	 that is the user's responsibility.  If ->buffer is NULL this
752	 is a simple allocation.  */
753      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
754      if (dfa == NULL)
755	return REG_ESPACE;
756      preg->allocated = sizeof (re_dfa_t);
757      preg->buffer = (unsigned char *) dfa;
758    }
759  preg->used = sizeof (re_dfa_t);
760
761  err = init_dfa (dfa, length);
762  if (BE (err != REG_NOERROR, 0))
763    {
764      free_dfa_content (dfa);
765      preg->buffer = NULL;
766      preg->allocated = 0;
767      return err;
768    }
769#ifdef DEBUG
770  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
771  dfa->re_str = re_malloc (char, length + 1);
772  strncpy (dfa->re_str, pattern, length + 1);
773#endif
774
775  __libc_lock_init (dfa->lock);
776
777  err = re_string_construct (&regexp, pattern, length, preg->translate,
778			     syntax & RE_ICASE, dfa);
779  if (BE (err != REG_NOERROR, 0))
780    {
781    re_compile_internal_free_return:
782      free_workarea_compile (preg);
783      re_string_destruct (&regexp);
784      free_dfa_content (dfa);
785      preg->buffer = NULL;
786      preg->allocated = 0;
787      return err;
788    }
789
790  /* Parse the regular expression, and build a structure tree.  */
791  preg->re_nsub = 0;
792  dfa->str_tree = parse (&regexp, preg, syntax, &err);
793  if (BE (dfa->str_tree == NULL, 0))
794    goto re_compile_internal_free_return;
795
796  /* Analyze the tree and create the nfa.  */
797  err = analyze (preg);
798  if (BE (err != REG_NOERROR, 0))
799    goto re_compile_internal_free_return;
800
801#ifdef RE_ENABLE_I18N
802  /* If possible, do searching in single byte encoding to speed things up.  */
803  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
804    optimize_utf8 (dfa);
805#endif
806
807  /* Then create the initial state of the dfa.  */
808  err = create_initial_state (dfa);
809
810  /* Release work areas.  */
811  free_workarea_compile (preg);
812  re_string_destruct (&regexp);
813
814  if (BE (err != REG_NOERROR, 0))
815    {
816      free_dfa_content (dfa);
817      preg->buffer = NULL;
818      preg->allocated = 0;
819    }
820
821  return err;
822}
823
824/* Initialize DFA.  We use the length of the regular expression PAT_LEN
825   as the initial length of some arrays.  */
826
827static reg_errcode_t
828init_dfa (re_dfa_t *dfa, size_t pat_len)
829{
830  unsigned int table_size;
831#ifndef _LIBC
832  char *codeset_name;
833#endif
834
835  memset (dfa, '\0', sizeof (re_dfa_t));
836
837  /* Force allocation of str_tree_storage the first time.  */
838  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
839
840  /* Avoid overflows.  */
841  if (pat_len == SIZE_MAX)
842    return REG_ESPACE;
843
844  dfa->nodes_alloc = pat_len + 1;
845  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
846
847  /*  table_size = 2 ^ ceil(log pat_len) */
848  for (table_size = 1; ; table_size <<= 1)
849    if (table_size > pat_len)
850      break;
851
852  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
853  dfa->state_hash_mask = table_size - 1;
854
855  dfa->mb_cur_max = MB_CUR_MAX;
856#ifdef _LIBC
857  if (dfa->mb_cur_max == 6
858      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
859    dfa->is_utf8 = 1;
860  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
861		       != 0);
862#else
863# ifdef HAVE_LANGINFO_CODESET
864  codeset_name = nl_langinfo (CODESET);
865# else
866  codeset_name = getenv ("LC_ALL");
867  if (codeset_name == NULL || codeset_name[0] == '\0')
868    codeset_name = getenv ("LC_CTYPE");
869  if (codeset_name == NULL || codeset_name[0] == '\0')
870    codeset_name = getenv ("LANG");
871  if (codeset_name == NULL)
872    codeset_name = "";
873  else if (strchr (codeset_name, '.') !=  NULL)
874    codeset_name = strchr (codeset_name, '.') + 1;
875# endif
876
877  if (strcasecmp (codeset_name, "UTF-8") == 0
878      || strcasecmp (codeset_name, "UTF8") == 0)
879    dfa->is_utf8 = 1;
880
881  /* We check exhaustively in the loop below if this charset is a
882     superset of ASCII.  */
883  dfa->map_notascii = 0;
884#endif
885
886#ifdef RE_ENABLE_I18N
887  if (dfa->mb_cur_max > 1)
888    {
889      if (dfa->is_utf8)
890	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
891      else
892	{
893	  int i, j, ch;
894
895	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
896	  if (BE (dfa->sb_char == NULL, 0))
897	    return REG_ESPACE;
898
899	  /* Set the bits corresponding to single byte chars.  */
900	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
902	      {
903		wint_t wch = __btowc (ch);
904		if (wch != WEOF)
905		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
906# ifndef _LIBC
907		if (isascii (ch) && wch != ch)
908		  dfa->map_notascii = 1;
909# endif
910	      }
911	}
912    }
913#endif
914
915  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
916    return REG_ESPACE;
917  return REG_NOERROR;
918}
919
920/* Initialize WORD_CHAR table, which indicate which character is
921   "word".  In this case "word" means that it is the word construction
922   character used by some operators like "\<", "\>", etc.  */
923
924static void
925internal_function
926init_word_char (re_dfa_t *dfa)
927{
928  dfa->word_ops_used = 1;
929  int i = 0;
930  int ch = 0;
931  if (BE (dfa->map_notascii == 0, 1))
932    {
933      if (sizeof (dfa->word_char[0]) == 8)
934	{
935          /* The extra temporaries here avoid "implicitly truncated"
936             warnings in the case when this is dead code, i.e. 32-bit.  */
937          const uint64_t wc0 = UINT64_C (0x03ff000000000000);
938          const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
939	  dfa->word_char[0] = wc0;
940	  dfa->word_char[1] = wc1;
941	  i = 2;
942	}
943      else if (sizeof (dfa->word_char[0]) == 4)
944	{
945	  dfa->word_char[0] = UINT32_C (0x00000000);
946	  dfa->word_char[1] = UINT32_C (0x03ff0000);
947	  dfa->word_char[2] = UINT32_C (0x87fffffe);
948	  dfa->word_char[3] = UINT32_C (0x07fffffe);
949	  i = 4;
950	}
951      else
952	abort ();
953      ch = 128;
954
955      if (BE (dfa->is_utf8, 1))
956	{
957	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
958	  return;
959	}
960    }
961
962  for (; i < BITSET_WORDS; ++i)
963    for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
964      if (isalnum (ch) || ch == '_')
965	dfa->word_char[i] |= (bitset_word_t) 1 << j;
966}
967
968/* Free the work area which are only used while compiling.  */
969
970static void
971free_workarea_compile (regex_t *preg)
972{
973  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
974  bin_tree_storage_t *storage, *next;
975  for (storage = dfa->str_tree_storage; storage; storage = next)
976    {
977      next = storage->next;
978      re_free (storage);
979    }
980  dfa->str_tree_storage = NULL;
981  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
982  dfa->str_tree = NULL;
983  re_free (dfa->org_indices);
984  dfa->org_indices = NULL;
985}
986
987/* Create initial states for all contexts.  */
988
989static reg_errcode_t
990create_initial_state (re_dfa_t *dfa)
991{
992  int first, i;
993  reg_errcode_t err;
994  re_node_set init_nodes;
995
996  /* Initial states have the epsilon closure of the node which is
997     the first node of the regular expression.  */
998  first = dfa->str_tree->first->node_idx;
999  dfa->init_node = first;
1000  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1001  if (BE (err != REG_NOERROR, 0))
1002    return err;
1003
1004  /* The back-references which are in initial states can epsilon transit,
1005     since in this case all of the subexpressions can be null.
1006     Then we add epsilon closures of the nodes which are the next nodes of
1007     the back-references.  */
1008  if (dfa->nbackref > 0)
1009    for (i = 0; i < init_nodes.nelem; ++i)
1010      {
1011	int node_idx = init_nodes.elems[i];
1012	re_token_type_t type = dfa->nodes[node_idx].type;
1013
1014	int clexp_idx;
1015	if (type != OP_BACK_REF)
1016	  continue;
1017	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1018	  {
1019	    re_token_t *clexp_node;
1020	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1021	    if (clexp_node->type == OP_CLOSE_SUBEXP
1022		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1023	      break;
1024	  }
1025	if (clexp_idx == init_nodes.nelem)
1026	  continue;
1027
1028	if (type == OP_BACK_REF)
1029	  {
1030	    int dest_idx = dfa->edests[node_idx].elems[0];
1031	    if (!re_node_set_contains (&init_nodes, dest_idx))
1032	      {
1033		reg_errcode_t err = re_node_set_merge (&init_nodes,
1034						       dfa->eclosures
1035						       + dest_idx);
1036		if (err != REG_NOERROR)
1037		  return err;
1038		i = 0;
1039	      }
1040	  }
1041      }
1042
1043  /* It must be the first time to invoke acquire_state.  */
1044  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1045  /* We don't check ERR here, since the initial state must not be NULL.  */
1046  if (BE (dfa->init_state == NULL, 0))
1047    return err;
1048  if (dfa->init_state->has_constraint)
1049    {
1050      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1051						       CONTEXT_WORD);
1052      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1053						     CONTEXT_NEWLINE);
1054      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1055							 &init_nodes,
1056							 CONTEXT_NEWLINE
1057							 | CONTEXT_BEGBUF);
1058      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1059	      || dfa->init_state_begbuf == NULL, 0))
1060	return err;
1061    }
1062  else
1063    dfa->init_state_word = dfa->init_state_nl
1064      = dfa->init_state_begbuf = dfa->init_state;
1065
1066  re_node_set_free (&init_nodes);
1067  return REG_NOERROR;
1068}
1069
1070#ifdef RE_ENABLE_I18N
1071/* If it is possible to do searching in single byte encoding instead of UTF-8
1072   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1073   DFA nodes where needed.  */
1074
1075static void
1076optimize_utf8 (re_dfa_t *dfa)
1077{
1078  int node, i, mb_chars = 0, has_period = 0;
1079
1080  for (node = 0; node < dfa->nodes_len; ++node)
1081    switch (dfa->nodes[node].type)
1082      {
1083      case CHARACTER:
1084	if (dfa->nodes[node].opr.c >= 0x80)
1085	  mb_chars = 1;
1086	break;
1087      case ANCHOR:
1088	switch (dfa->nodes[node].opr.ctx_type)
1089	  {
1090	  case LINE_FIRST:
1091	  case LINE_LAST:
1092	  case BUF_FIRST:
1093	  case BUF_LAST:
1094	    break;
1095	  default:
1096	    /* Word anchors etc. cannot be handled.  It's okay to test
1097	       opr.ctx_type since constraints (for all DFA nodes) are
1098	       created by ORing one or more opr.ctx_type values.  */
1099	    return;
1100	  }
1101	break;
1102      case OP_PERIOD:
1103	has_period = 1;
1104	break;
1105      case OP_BACK_REF:
1106      case OP_ALT:
1107      case END_OF_RE:
1108      case OP_DUP_ASTERISK:
1109      case OP_OPEN_SUBEXP:
1110      case OP_CLOSE_SUBEXP:
1111	break;
1112      case COMPLEX_BRACKET:
1113	return;
1114      case SIMPLE_BRACKET:
1115	/* Just double check.  The non-ASCII range starts at 0x80.  */
1116	assert (0x80 % BITSET_WORD_BITS == 0);
1117	for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1118	  if (dfa->nodes[node].opr.sbcset[i])
1119	    return;
1120	break;
1121      default:
1122	abort ();
1123      }
1124
1125  if (mb_chars || has_period)
1126    for (node = 0; node < dfa->nodes_len; ++node)
1127      {
1128	if (dfa->nodes[node].type == CHARACTER
1129	    && dfa->nodes[node].opr.c >= 0x80)
1130	  dfa->nodes[node].mb_partial = 0;
1131	else if (dfa->nodes[node].type == OP_PERIOD)
1132	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1133      }
1134
1135  /* The search can be in single byte locale.  */
1136  dfa->mb_cur_max = 1;
1137  dfa->is_utf8 = 0;
1138  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1139}
1140#endif
1141
1142/* Analyze the structure tree, and calculate "first", "next", "edest",
1143   "eclosure", and "inveclosure".  */
1144
1145static reg_errcode_t
1146analyze (regex_t *preg)
1147{
1148  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1149  reg_errcode_t ret;
1150
1151  /* Allocate arrays.  */
1152  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1153  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1154  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1155  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1156  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1157	  || dfa->eclosures == NULL, 0))
1158    return REG_ESPACE;
1159
1160  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1161  if (dfa->subexp_map != NULL)
1162    {
1163      int i;
1164      for (i = 0; i < preg->re_nsub; i++)
1165	dfa->subexp_map[i] = i;
1166      preorder (dfa->str_tree, optimize_subexps, dfa);
1167      for (i = 0; i < preg->re_nsub; i++)
1168	if (dfa->subexp_map[i] != i)
1169	  break;
1170      if (i == preg->re_nsub)
1171	{
1172	  free (dfa->subexp_map);
1173	  dfa->subexp_map = NULL;
1174	}
1175    }
1176
1177  ret = postorder (dfa->str_tree, lower_subexps, preg);
1178  if (BE (ret != REG_NOERROR, 0))
1179    return ret;
1180  ret = postorder (dfa->str_tree, calc_first, dfa);
1181  if (BE (ret != REG_NOERROR, 0))
1182    return ret;
1183  preorder (dfa->str_tree, calc_next, dfa);
1184  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1185  if (BE (ret != REG_NOERROR, 0))
1186    return ret;
1187  ret = calc_eclosure (dfa);
1188  if (BE (ret != REG_NOERROR, 0))
1189    return ret;
1190
1191  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1193  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1194      || dfa->nbackref)
1195    {
1196      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1197      if (BE (dfa->inveclosures == NULL, 0))
1198	return REG_ESPACE;
1199      ret = calc_inveclosure (dfa);
1200    }
1201
1202  return ret;
1203}
1204
1205/* Our parse trees are very unbalanced, so we cannot use a stack to
1206   implement parse tree visits.  Instead, we use parent pointers and
1207   some hairy code in these two functions.  */
1208static reg_errcode_t
1209postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1210	   void *extra)
1211{
1212  bin_tree_t *node, *prev;
1213
1214  for (node = root; ; )
1215    {
1216      /* Descend down the tree, preferably to the left (or to the right
1217	 if that's the only child).  */
1218      while (node->left || node->right)
1219	if (node->left)
1220	  node = node->left;
1221	else
1222	  node = node->right;
1223
1224      do
1225	{
1226	  reg_errcode_t err = fn (extra, node);
1227	  if (BE (err != REG_NOERROR, 0))
1228	    return err;
1229	  if (node->parent == NULL)
1230	    return REG_NOERROR;
1231	  prev = node;
1232	  node = node->parent;
1233	}
1234      /* Go up while we have a node that is reached from the right.  */
1235      while (node->right == prev || node->right == NULL);
1236      node = node->right;
1237    }
1238}
1239
1240static reg_errcode_t
1241preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242	  void *extra)
1243{
1244  bin_tree_t *node;
1245
1246  for (node = root; ; )
1247    {
1248      reg_errcode_t err = fn (extra, node);
1249      if (BE (err != REG_NOERROR, 0))
1250	return err;
1251
1252      /* Go to the left node, or up and to the right.  */
1253      if (node->left)
1254	node = node->left;
1255      else
1256	{
1257	  bin_tree_t *prev = NULL;
1258	  while (node->right == prev || node->right == NULL)
1259	    {
1260	      prev = node;
1261	      node = node->parent;
1262	      if (!node)
1263		return REG_NOERROR;
1264	    }
1265	  node = node->right;
1266	}
1267    }
1268}
1269
1270/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1272   backreferences as well.  Requires a preorder visit.  */
1273static reg_errcode_t
1274optimize_subexps (void *extra, bin_tree_t *node)
1275{
1276  re_dfa_t *dfa = (re_dfa_t *) extra;
1277
1278  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1279    {
1280      int idx = node->token.opr.idx;
1281      node->token.opr.idx = dfa->subexp_map[idx];
1282      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1283    }
1284
1285  else if (node->token.type == SUBEXP
1286	   && node->left && node->left->token.type == SUBEXP)
1287    {
1288      int other_idx = node->left->token.opr.idx;
1289
1290      node->left = node->left->left;
1291      if (node->left)
1292	node->left->parent = node;
1293
1294      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295      if (other_idx < BITSET_WORD_BITS)
1296	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1297    }
1298
1299  return REG_NOERROR;
1300}
1301
1302/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1304static reg_errcode_t
1305lower_subexps (void *extra, bin_tree_t *node)
1306{
1307  regex_t *preg = (regex_t *) extra;
1308  reg_errcode_t err = REG_NOERROR;
1309
1310  if (node->left && node->left->token.type == SUBEXP)
1311    {
1312      node->left = lower_subexp (&err, preg, node->left);
1313      if (node->left)
1314	node->left->parent = node;
1315    }
1316  if (node->right && node->right->token.type == SUBEXP)
1317    {
1318      node->right = lower_subexp (&err, preg, node->right);
1319      if (node->right)
1320	node->right->parent = node;
1321    }
1322
1323  return err;
1324}
1325
1326static bin_tree_t *
1327lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1328{
1329  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1330  bin_tree_t *body = node->left;
1331  bin_tree_t *op, *cls, *tree1, *tree;
1332
1333  if (preg->no_sub
1334      /* We do not optimize empty subexpressions, because otherwise we may
1335	 have bad CONCAT nodes with NULL children.  This is obviously not
1336	 very common, so we do not lose much.  An example that triggers
1337	 this case is the sed "script" /\(\)/x.  */
1338      && node->left != NULL
1339      && (node->token.opr.idx >= BITSET_WORD_BITS
1340	  || !(dfa->used_bkref_map
1341	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1342    return node->left;
1343
1344  /* Convert the SUBEXP node to the concatenation of an
1345     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1346  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349  tree = create_tree (dfa, op, tree1, CONCAT);
1350  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1351    {
1352      *err = REG_ESPACE;
1353      return NULL;
1354    }
1355
1356  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358  return tree;
1359}
1360
1361/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362   nodes.  Requires a postorder visit.  */
1363static reg_errcode_t
1364calc_first (void *extra, bin_tree_t *node)
1365{
1366  re_dfa_t *dfa = (re_dfa_t *) extra;
1367  if (node->token.type == CONCAT)
1368    {
1369      node->first = node->left->first;
1370      node->node_idx = node->left->node_idx;
1371    }
1372  else
1373    {
1374      node->first = node;
1375      node->node_idx = re_dfa_add_node (dfa, node->token);
1376      if (BE (node->node_idx == -1, 0))
1377	return REG_ESPACE;
1378      if (node->token.type == ANCHOR)
1379	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1380    }
1381  return REG_NOERROR;
1382}
1383
1384/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1385static reg_errcode_t
1386calc_next (void *extra, bin_tree_t *node)
1387{
1388  switch (node->token.type)
1389    {
1390    case OP_DUP_ASTERISK:
1391      node->left->next = node;
1392      break;
1393    case CONCAT:
1394      node->left->next = node->right->first;
1395      node->right->next = node->next;
1396      break;
1397    default:
1398      if (node->left)
1399	node->left->next = node->next;
1400      if (node->right)
1401	node->right->next = node->next;
1402      break;
1403    }
1404  return REG_NOERROR;
1405}
1406
1407/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1408static reg_errcode_t
1409link_nfa_nodes (void *extra, bin_tree_t *node)
1410{
1411  re_dfa_t *dfa = (re_dfa_t *) extra;
1412  int idx = node->node_idx;
1413  reg_errcode_t err = REG_NOERROR;
1414
1415  switch (node->token.type)
1416    {
1417    case CONCAT:
1418      break;
1419
1420    case END_OF_RE:
1421      assert (node->next == NULL);
1422      break;
1423
1424    case OP_DUP_ASTERISK:
1425    case OP_ALT:
1426      {
1427	int left, right;
1428	dfa->has_plural_match = 1;
1429	if (node->left != NULL)
1430	  left = node->left->first->node_idx;
1431	else
1432	  left = node->next->node_idx;
1433	if (node->right != NULL)
1434	  right = node->right->first->node_idx;
1435	else
1436	  right = node->next->node_idx;
1437	assert (left > -1);
1438	assert (right > -1);
1439	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1440      }
1441      break;
1442
1443    case ANCHOR:
1444    case OP_OPEN_SUBEXP:
1445    case OP_CLOSE_SUBEXP:
1446      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1447      break;
1448
1449    case OP_BACK_REF:
1450      dfa->nexts[idx] = node->next->node_idx;
1451      if (node->token.type == OP_BACK_REF)
1452	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1453      break;
1454
1455    default:
1456      assert (!IS_EPSILON_NODE (node->token.type));
1457      dfa->nexts[idx] = node->next->node_idx;
1458      break;
1459    }
1460
1461  return err;
1462}
1463
1464/* Duplicate the epsilon closure of the node ROOT_NODE.
1465   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466   to their own constraint.  */
1467
1468static reg_errcode_t
1469internal_function
1470duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1471			int root_node, unsigned int init_constraint)
1472{
1473  int org_node, clone_node, ret;
1474  unsigned int constraint = init_constraint;
1475  for (org_node = top_org_node, clone_node = top_clone_node;;)
1476    {
1477      int org_dest, clone_dest;
1478      if (dfa->nodes[org_node].type == OP_BACK_REF)
1479	{
1480	  /* If the back reference epsilon-transit, its destination must
1481	     also have the constraint.  Then duplicate the epsilon closure
1482	     of the destination of the back reference, and store it in
1483	     edests of the back reference.  */
1484	  org_dest = dfa->nexts[org_node];
1485	  re_node_set_empty (dfa->edests + clone_node);
1486	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1487	  if (BE (clone_dest == -1, 0))
1488	    return REG_ESPACE;
1489	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1490	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491	  if (BE (ret < 0, 0))
1492	    return REG_ESPACE;
1493	}
1494      else if (dfa->edests[org_node].nelem == 0)
1495	{
1496	  /* In case of the node can't epsilon-transit, don't duplicate the
1497	     destination and store the original destination as the
1498	     destination of the node.  */
1499	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1500	  break;
1501	}
1502      else if (dfa->edests[org_node].nelem == 1)
1503	{
1504	  /* In case of the node can epsilon-transit, and it has only one
1505	     destination.  */
1506	  org_dest = dfa->edests[org_node].elems[0];
1507	  re_node_set_empty (dfa->edests + clone_node);
1508	  /* If the node is root_node itself, it means the epsilon clsoure
1509	     has a loop.   Then tie it to the destination of the root_node.  */
1510	  if (org_node == root_node && clone_node != org_node)
1511	    {
1512	      ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1513	      if (BE (ret < 0, 0))
1514		return REG_ESPACE;
1515	      break;
1516	    }
1517	  /* In case of the node has another constraint, add it.  */
1518	  constraint |= dfa->nodes[org_node].constraint;
1519	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1520	  if (BE (clone_dest == -1, 0))
1521	    return REG_ESPACE;
1522	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1523	  if (BE (ret < 0, 0))
1524	    return REG_ESPACE;
1525	}
1526      else /* dfa->edests[org_node].nelem == 2 */
1527	{
1528	  /* In case of the node can epsilon-transit, and it has two
1529	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1530	  org_dest = dfa->edests[org_node].elems[0];
1531	  re_node_set_empty (dfa->edests + clone_node);
1532	  /* Search for a duplicated node which satisfies the constraint.  */
1533	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1534	  if (clone_dest == -1)
1535	    {
1536	      /* There is no such duplicated node, create a new one.  */
1537	      reg_errcode_t err;
1538	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1539	      if (BE (clone_dest == -1, 0))
1540		return REG_ESPACE;
1541	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542	      if (BE (ret < 0, 0))
1543		return REG_ESPACE;
1544	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545					    root_node, constraint);
1546	      if (BE (err != REG_NOERROR, 0))
1547		return err;
1548	    }
1549	  else
1550	    {
1551	      /* There is a duplicated node which satisfies the constraint,
1552		 use it to avoid infinite loop.  */
1553	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554	      if (BE (ret < 0, 0))
1555		return REG_ESPACE;
1556	    }
1557
1558	  org_dest = dfa->edests[org_node].elems[1];
1559	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1560	  if (BE (clone_dest == -1, 0))
1561	    return REG_ESPACE;
1562	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563	  if (BE (ret < 0, 0))
1564	    return REG_ESPACE;
1565	}
1566      org_node = org_dest;
1567      clone_node = clone_dest;
1568    }
1569  return REG_NOERROR;
1570}
1571
1572/* Search for a node which is duplicated from the node ORG_NODE, and
1573   satisfies the constraint CONSTRAINT.  */
1574
1575static int
1576search_duplicated_node (const re_dfa_t *dfa, int org_node,
1577			unsigned int constraint)
1578{
1579  int idx;
1580  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1581    {
1582      if (org_node == dfa->org_indices[idx]
1583	  && constraint == dfa->nodes[idx].constraint)
1584	return idx; /* Found.  */
1585    }
1586  return -1; /* Not found.  */
1587}
1588
1589/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1590   Return the index of the new node, or -1 if insufficient storage is
1591   available.  */
1592
1593static int
1594duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1595{
1596  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597  if (BE (dup_idx != -1, 1))
1598    {
1599      dfa->nodes[dup_idx].constraint = constraint;
1600      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1601      dfa->nodes[dup_idx].duplicated = 1;
1602
1603      /* Store the index of the original node.  */
1604      dfa->org_indices[dup_idx] = org_idx;
1605    }
1606  return dup_idx;
1607}
1608
1609static reg_errcode_t
1610calc_inveclosure (re_dfa_t *dfa)
1611{
1612  int src, idx, ret;
1613  for (idx = 0; idx < dfa->nodes_len; ++idx)
1614    re_node_set_init_empty (dfa->inveclosures + idx);
1615
1616  for (src = 0; src < dfa->nodes_len; ++src)
1617    {
1618      int *elems = dfa->eclosures[src].elems;
1619      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1620	{
1621	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1622	  if (BE (ret == -1, 0))
1623	    return REG_ESPACE;
1624	}
1625    }
1626
1627  return REG_NOERROR;
1628}
1629
1630/* Calculate "eclosure" for all the node in DFA.  */
1631
1632static reg_errcode_t
1633calc_eclosure (re_dfa_t *dfa)
1634{
1635  int node_idx, incomplete;
1636#ifdef DEBUG
1637  assert (dfa->nodes_len > 0);
1638#endif
1639  incomplete = 0;
1640  /* For each nodes, calculate epsilon closure.  */
1641  for (node_idx = 0; ; ++node_idx)
1642    {
1643      reg_errcode_t err;
1644      re_node_set eclosure_elem;
1645      if (node_idx == dfa->nodes_len)
1646	{
1647	  if (!incomplete)
1648	    break;
1649	  incomplete = 0;
1650	  node_idx = 0;
1651	}
1652
1653#ifdef DEBUG
1654      assert (dfa->eclosures[node_idx].nelem != -1);
1655#endif
1656
1657      /* If we have already calculated, skip it.  */
1658      if (dfa->eclosures[node_idx].nelem != 0)
1659	continue;
1660      /* Calculate epsilon closure of `node_idx'.  */
1661      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1662      if (BE (err != REG_NOERROR, 0))
1663	return err;
1664
1665      if (dfa->eclosures[node_idx].nelem == 0)
1666	{
1667	  incomplete = 1;
1668	  re_node_set_free (&eclosure_elem);
1669	}
1670    }
1671  return REG_NOERROR;
1672}
1673
1674/* Calculate epsilon closure of NODE.  */
1675
1676static reg_errcode_t
1677calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1678{
1679  reg_errcode_t err;
1680  int i;
1681  re_node_set eclosure;
1682  int ret;
1683  int incomplete = 0;
1684  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1685  if (BE (err != REG_NOERROR, 0))
1686    return err;
1687
1688  /* This indicates that we are calculating this node now.
1689     We reference this value to avoid infinite loop.  */
1690  dfa->eclosures[node].nelem = -1;
1691
1692  /* If the current node has constraints, duplicate all nodes
1693     since they must inherit the constraints.  */
1694  if (dfa->nodes[node].constraint
1695      && dfa->edests[node].nelem
1696      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1697    {
1698      err = duplicate_node_closure (dfa, node, node, node,
1699				    dfa->nodes[node].constraint);
1700      if (BE (err != REG_NOERROR, 0))
1701	return err;
1702    }
1703
1704  /* Expand each epsilon destination nodes.  */
1705  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1706    for (i = 0; i < dfa->edests[node].nelem; ++i)
1707      {
1708	re_node_set eclosure_elem;
1709	int edest = dfa->edests[node].elems[i];
1710	/* If calculating the epsilon closure of `edest' is in progress,
1711	   return intermediate result.  */
1712	if (dfa->eclosures[edest].nelem == -1)
1713	  {
1714	    incomplete = 1;
1715	    continue;
1716	  }
1717	/* If we haven't calculated the epsilon closure of `edest' yet,
1718	   calculate now. Otherwise use calculated epsilon closure.  */
1719	if (dfa->eclosures[edest].nelem == 0)
1720	  {
1721	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1722	    if (BE (err != REG_NOERROR, 0))
1723	      return err;
1724	  }
1725	else
1726	  eclosure_elem = dfa->eclosures[edest];
1727	/* Merge the epsilon closure of `edest'.  */
1728	err = re_node_set_merge (&eclosure, &eclosure_elem);
1729	if (BE (err != REG_NOERROR, 0))
1730	  return err;
1731	/* If the epsilon closure of `edest' is incomplete,
1732	   the epsilon closure of this node is also incomplete.  */
1733	if (dfa->eclosures[edest].nelem == 0)
1734	  {
1735	    incomplete = 1;
1736	    re_node_set_free (&eclosure_elem);
1737	  }
1738      }
1739
1740  /* An epsilon closure includes itself.  */
1741  ret = re_node_set_insert (&eclosure, node);
1742  if (BE (ret < 0, 0))
1743    return REG_ESPACE;
1744  if (incomplete && !root)
1745    dfa->eclosures[node].nelem = 0;
1746  else
1747    dfa->eclosures[node] = eclosure;
1748  *new_set = eclosure;
1749  return REG_NOERROR;
1750}
1751
1752/* Functions for token which are used in the parser.  */
1753
1754/* Fetch a token from INPUT.
1755   We must not use this function inside bracket expressions.  */
1756
1757static void
1758internal_function
1759fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1760{
1761  re_string_skip_bytes (input, peek_token (result, input, syntax));
1762}
1763
1764/* Peek a token from INPUT, and return the length of the token.
1765   We must not use this function inside bracket expressions.  */
1766
1767static int
1768internal_function
1769peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1770{
1771  unsigned char c;
1772
1773  if (re_string_eoi (input))
1774    {
1775      token->type = END_OF_RE;
1776      return 0;
1777    }
1778
1779  c = re_string_peek_byte (input, 0);
1780  token->opr.c = c;
1781
1782  token->word_char = 0;
1783#ifdef RE_ENABLE_I18N
1784  token->mb_partial = 0;
1785  if (input->mb_cur_max > 1 &&
1786      !re_string_first_byte (input, re_string_cur_idx (input)))
1787    {
1788      token->type = CHARACTER;
1789      token->mb_partial = 1;
1790      return 1;
1791    }
1792#endif
1793  if (c == '\\')
1794    {
1795      unsigned char c2;
1796      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1797	{
1798	  token->type = BACK_SLASH;
1799	  return 1;
1800	}
1801
1802      c2 = re_string_peek_byte_case (input, 1);
1803      token->opr.c = c2;
1804      token->type = CHARACTER;
1805#ifdef RE_ENABLE_I18N
1806      if (input->mb_cur_max > 1)
1807	{
1808	  wint_t wc = re_string_wchar_at (input,
1809					  re_string_cur_idx (input) + 1);
1810	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1811	}
1812      else
1813#endif
1814	token->word_char = IS_WORD_CHAR (c2) != 0;
1815
1816      switch (c2)
1817	{
1818	case '|':
1819	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1820	    token->type = OP_ALT;
1821	  break;
1822	case '1': case '2': case '3': case '4': case '5':
1823	case '6': case '7': case '8': case '9':
1824	  if (!(syntax & RE_NO_BK_REFS))
1825	    {
1826	      token->type = OP_BACK_REF;
1827	      token->opr.idx = c2 - '1';
1828	    }
1829	  break;
1830	case '<':
1831	  if (!(syntax & RE_NO_GNU_OPS))
1832	    {
1833	      token->type = ANCHOR;
1834	      token->opr.ctx_type = WORD_FIRST;
1835	    }
1836	  break;
1837	case '>':
1838	  if (!(syntax & RE_NO_GNU_OPS))
1839	    {
1840	      token->type = ANCHOR;
1841	      token->opr.ctx_type = WORD_LAST;
1842	    }
1843	  break;
1844	case 'b':
1845	  if (!(syntax & RE_NO_GNU_OPS))
1846	    {
1847	      token->type = ANCHOR;
1848	      token->opr.ctx_type = WORD_DELIM;
1849	    }
1850	  break;
1851	case 'B':
1852	  if (!(syntax & RE_NO_GNU_OPS))
1853	    {
1854	      token->type = ANCHOR;
1855	      token->opr.ctx_type = NOT_WORD_DELIM;
1856	    }
1857	  break;
1858	case 'w':
1859	  if (!(syntax & RE_NO_GNU_OPS))
1860	    token->type = OP_WORD;
1861	  break;
1862	case 'W':
1863	  if (!(syntax & RE_NO_GNU_OPS))
1864	    token->type = OP_NOTWORD;
1865	  break;
1866	case 's':
1867	  if (!(syntax & RE_NO_GNU_OPS))
1868	    token->type = OP_SPACE;
1869	  break;
1870	case 'S':
1871	  if (!(syntax & RE_NO_GNU_OPS))
1872	    token->type = OP_NOTSPACE;
1873	  break;
1874	case '`':
1875	  if (!(syntax & RE_NO_GNU_OPS))
1876	    {
1877	      token->type = ANCHOR;
1878	      token->opr.ctx_type = BUF_FIRST;
1879	    }
1880	  break;
1881	case '\'':
1882	  if (!(syntax & RE_NO_GNU_OPS))
1883	    {
1884	      token->type = ANCHOR;
1885	      token->opr.ctx_type = BUF_LAST;
1886	    }
1887	  break;
1888	case '(':
1889	  if (!(syntax & RE_NO_BK_PARENS))
1890	    token->type = OP_OPEN_SUBEXP;
1891	  break;
1892	case ')':
1893	  if (!(syntax & RE_NO_BK_PARENS))
1894	    token->type = OP_CLOSE_SUBEXP;
1895	  break;
1896	case '+':
1897	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898	    token->type = OP_DUP_PLUS;
1899	  break;
1900	case '?':
1901	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902	    token->type = OP_DUP_QUESTION;
1903	  break;
1904	case '{':
1905	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906	    token->type = OP_OPEN_DUP_NUM;
1907	  break;
1908	case '}':
1909	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910	    token->type = OP_CLOSE_DUP_NUM;
1911	  break;
1912	default:
1913	  break;
1914	}
1915      return 2;
1916    }
1917
1918  token->type = CHARACTER;
1919#ifdef RE_ENABLE_I18N
1920  if (input->mb_cur_max > 1)
1921    {
1922      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1923      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1924    }
1925  else
1926#endif
1927    token->word_char = IS_WORD_CHAR (token->opr.c);
1928
1929  switch (c)
1930    {
1931    case '\n':
1932      if (syntax & RE_NEWLINE_ALT)
1933	token->type = OP_ALT;
1934      break;
1935    case '|':
1936      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1937	token->type = OP_ALT;
1938      break;
1939    case '*':
1940      token->type = OP_DUP_ASTERISK;
1941      break;
1942    case '+':
1943      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944	token->type = OP_DUP_PLUS;
1945      break;
1946    case '?':
1947      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948	token->type = OP_DUP_QUESTION;
1949      break;
1950    case '{':
1951      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952	token->type = OP_OPEN_DUP_NUM;
1953      break;
1954    case '}':
1955      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956	token->type = OP_CLOSE_DUP_NUM;
1957      break;
1958    case '(':
1959      if (syntax & RE_NO_BK_PARENS)
1960	token->type = OP_OPEN_SUBEXP;
1961      break;
1962    case ')':
1963      if (syntax & RE_NO_BK_PARENS)
1964	token->type = OP_CLOSE_SUBEXP;
1965      break;
1966    case '[':
1967      token->type = OP_OPEN_BRACKET;
1968      break;
1969    case '.':
1970      token->type = OP_PERIOD;
1971      break;
1972    case '^':
1973      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1974	  re_string_cur_idx (input) != 0)
1975	{
1976	  char prev = re_string_peek_byte (input, -1);
1977	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1978	    break;
1979	}
1980      token->type = ANCHOR;
1981      token->opr.ctx_type = LINE_FIRST;
1982      break;
1983    case '$':
1984      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1985	  re_string_cur_idx (input) + 1 != re_string_length (input))
1986	{
1987	  re_token_t next;
1988	  re_string_skip_bytes (input, 1);
1989	  peek_token (&next, input, syntax);
1990	  re_string_skip_bytes (input, -1);
1991	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1992	    break;
1993	}
1994      token->type = ANCHOR;
1995      token->opr.ctx_type = LINE_LAST;
1996      break;
1997    default:
1998      break;
1999    }
2000  return 1;
2001}
2002
2003/* Peek a token from INPUT, and return the length of the token.
2004   We must not use this function out of bracket expressions.  */
2005
2006static int
2007internal_function
2008peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2009{
2010  unsigned char c;
2011  if (re_string_eoi (input))
2012    {
2013      token->type = END_OF_RE;
2014      return 0;
2015    }
2016  c = re_string_peek_byte (input, 0);
2017  token->opr.c = c;
2018
2019#ifdef RE_ENABLE_I18N
2020  if (input->mb_cur_max > 1 &&
2021      !re_string_first_byte (input, re_string_cur_idx (input)))
2022    {
2023      token->type = CHARACTER;
2024      return 1;
2025    }
2026#endif /* RE_ENABLE_I18N */
2027
2028  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2029      && re_string_cur_idx (input) + 1 < re_string_length (input))
2030    {
2031      /* In this case, '\' escape a character.  */
2032      unsigned char c2;
2033      re_string_skip_bytes (input, 1);
2034      c2 = re_string_peek_byte (input, 0);
2035      token->opr.c = c2;
2036      token->type = CHARACTER;
2037      return 1;
2038    }
2039  if (c == '[') /* '[' is a special char in a bracket exps.  */
2040    {
2041      unsigned char c2;
2042      int token_len;
2043      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2044	c2 = re_string_peek_byte (input, 1);
2045      else
2046	c2 = 0;
2047      token->opr.c = c2;
2048      token_len = 2;
2049      switch (c2)
2050	{
2051	case '.':
2052	  token->type = OP_OPEN_COLL_ELEM;
2053	  break;
2054	case '=':
2055	  token->type = OP_OPEN_EQUIV_CLASS;
2056	  break;
2057	case ':':
2058	  if (syntax & RE_CHAR_CLASSES)
2059	    {
2060	      token->type = OP_OPEN_CHAR_CLASS;
2061	      break;
2062	    }
2063	  /* else fall through.  */
2064	default:
2065	  token->type = CHARACTER;
2066	  token->opr.c = c;
2067	  token_len = 1;
2068	  break;
2069	}
2070      return token_len;
2071    }
2072  switch (c)
2073    {
2074    case '-':
2075      token->type = OP_CHARSET_RANGE;
2076      break;
2077    case ']':
2078      token->type = OP_CLOSE_BRACKET;
2079      break;
2080    case '^':
2081      token->type = OP_NON_MATCH_LIST;
2082      break;
2083    default:
2084      token->type = CHARACTER;
2085    }
2086  return 1;
2087}
2088
2089/* Functions for parser.  */
2090
2091/* Entry point of the parser.
2092   Parse the regular expression REGEXP and return the structure tree.
2093   If an error is occured, ERR is set by error code, and return NULL.
2094   This function build the following tree, from regular expression <reg_exp>:
2095	   CAT
2096	   / \
2097	  /   \
2098   <reg_exp>  EOR
2099
2100   CAT means concatenation.
2101   EOR means end of regular expression.  */
2102
2103static bin_tree_t *
2104parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2105       reg_errcode_t *err)
2106{
2107  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2108  bin_tree_t *tree, *eor, *root;
2109  re_token_t current_token;
2110  dfa->syntax = syntax;
2111  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2112  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2113  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114    return NULL;
2115  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2116  if (tree != NULL)
2117    root = create_tree (dfa, tree, eor, CONCAT);
2118  else
2119    root = eor;
2120  if (BE (eor == NULL || root == NULL, 0))
2121    {
2122      *err = REG_ESPACE;
2123      return NULL;
2124    }
2125  return root;
2126}
2127
2128/* This function build the following tree, from regular expression
2129   <branch1>|<branch2>:
2130	   ALT
2131	   / \
2132	  /   \
2133   <branch1> <branch2>
2134
2135   ALT means alternative, which represents the operator `|'.  */
2136
2137static bin_tree_t *
2138parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2139	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2140{
2141  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2142  bin_tree_t *tree, *branch = NULL;
2143  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2144  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2145    return NULL;
2146
2147  while (token->type == OP_ALT)
2148    {
2149      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2150      if (token->type != OP_ALT && token->type != END_OF_RE
2151	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2152	{
2153	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2154	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2155	    return NULL;
2156	}
2157      else
2158	branch = NULL;
2159      tree = create_tree (dfa, tree, branch, OP_ALT);
2160      if (BE (tree == NULL, 0))
2161	{
2162	  *err = REG_ESPACE;
2163	  return NULL;
2164	}
2165    }
2166  return tree;
2167}
2168
2169/* This function build the following tree, from regular expression
2170   <exp1><exp2>:
2171	CAT
2172	/ \
2173       /   \
2174   <exp1> <exp2>
2175
2176   CAT means concatenation.  */
2177
2178static bin_tree_t *
2179parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2180	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2181{
2182  bin_tree_t *tree, *exp;
2183  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2185  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186    return NULL;
2187
2188  while (token->type != OP_ALT && token->type != END_OF_RE
2189	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2190    {
2191      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2192      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2193	{
2194	  if (tree != NULL)
2195	    postorder (tree, free_tree, NULL);
2196	  return NULL;
2197	}
2198      if (tree != NULL && exp != NULL)
2199	{
2200	  bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2201	  if (newtree == NULL)
2202	    {
2203	      postorder (exp, free_tree, NULL);
2204	      postorder (tree, free_tree, NULL);
2205	      *err = REG_ESPACE;
2206	      return NULL;
2207	    }
2208	  tree = newtree;
2209	}
2210      else if (tree == NULL)
2211	tree = exp;
2212      /* Otherwise exp == NULL, we don't need to create new tree.  */
2213    }
2214  return tree;
2215}
2216
2217/* This function build the following tree, from regular expression a*:
2218	 *
2219	 |
2220	 a
2221*/
2222
2223static bin_tree_t *
2224parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2226{
2227  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228  bin_tree_t *tree;
2229  switch (token->type)
2230    {
2231    case CHARACTER:
2232      tree = create_token_tree (dfa, NULL, NULL, token);
2233      if (BE (tree == NULL, 0))
2234	{
2235	  *err = REG_ESPACE;
2236	  return NULL;
2237	}
2238#ifdef RE_ENABLE_I18N
2239      if (dfa->mb_cur_max > 1)
2240	{
2241	  while (!re_string_eoi (regexp)
2242		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243	    {
2244	      bin_tree_t *mbc_remain;
2245	      fetch_token (token, regexp, syntax);
2246	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2249		{
2250		  *err = REG_ESPACE;
2251		  return NULL;
2252		}
2253	    }
2254	}
2255#endif
2256      break;
2257    case OP_OPEN_SUBEXP:
2258      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260	return NULL;
2261      break;
2262    case OP_OPEN_BRACKET:
2263      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265	return NULL;
2266      break;
2267    case OP_BACK_REF:
2268      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269	{
2270	  *err = REG_ESUBREG;
2271	  return NULL;
2272	}
2273      dfa->used_bkref_map |= 1 << token->opr.idx;
2274      tree = create_token_tree (dfa, NULL, NULL, token);
2275      if (BE (tree == NULL, 0))
2276	{
2277	  *err = REG_ESPACE;
2278	  return NULL;
2279	}
2280      ++dfa->nbackref;
2281      dfa->has_mb_node = 1;
2282      break;
2283    case OP_OPEN_DUP_NUM:
2284      if (syntax & RE_CONTEXT_INVALID_DUP)
2285	{
2286	  *err = REG_BADRPT;
2287	  return NULL;
2288	}
2289      /* FALLTHROUGH */
2290    case OP_DUP_ASTERISK:
2291    case OP_DUP_PLUS:
2292    case OP_DUP_QUESTION:
2293      if (syntax & RE_CONTEXT_INVALID_OPS)
2294	{
2295	  *err = REG_BADRPT;
2296	  return NULL;
2297	}
2298      else if (syntax & RE_CONTEXT_INDEP_OPS)
2299	{
2300	  fetch_token (token, regexp, syntax);
2301	  return parse_expression (regexp, preg, token, syntax, nest, err);
2302	}
2303      /* else fall through  */
2304    case OP_CLOSE_SUBEXP:
2305      if ((token->type == OP_CLOSE_SUBEXP) &&
2306	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307	{
2308	  *err = REG_ERPAREN;
2309	  return NULL;
2310	}
2311      /* else fall through  */
2312    case OP_CLOSE_DUP_NUM:
2313      /* We treat it as a normal character.  */
2314
2315      /* Then we can these characters as normal characters.  */
2316      token->type = CHARACTER;
2317      /* mb_partial and word_char bits should be initialized already
2318	 by peek_token.  */
2319      tree = create_token_tree (dfa, NULL, NULL, token);
2320      if (BE (tree == NULL, 0))
2321	{
2322	  *err = REG_ESPACE;
2323	  return NULL;
2324	}
2325      break;
2326    case ANCHOR:
2327      if ((token->opr.ctx_type
2328	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329	  && dfa->word_ops_used == 0)
2330	init_word_char (dfa);
2331      if (token->opr.ctx_type == WORD_DELIM
2332	  || token->opr.ctx_type == NOT_WORD_DELIM)
2333	{
2334	  bin_tree_t *tree_first, *tree_last;
2335	  if (token->opr.ctx_type == WORD_DELIM)
2336	    {
2337	      token->opr.ctx_type = WORD_FIRST;
2338	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2339	      token->opr.ctx_type = WORD_LAST;
2340	    }
2341	  else
2342	    {
2343	      token->opr.ctx_type = INSIDE_WORD;
2344	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2345	      token->opr.ctx_type = INSIDE_NOTWORD;
2346	    }
2347	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2348	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350	    {
2351	      *err = REG_ESPACE;
2352	      return NULL;
2353	    }
2354	}
2355      else
2356	{
2357	  tree = create_token_tree (dfa, NULL, NULL, token);
2358	  if (BE (tree == NULL, 0))
2359	    {
2360	      *err = REG_ESPACE;
2361	      return NULL;
2362	    }
2363	}
2364      /* We must return here, since ANCHORs can't be followed
2365	 by repetition operators.
2366	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2368      fetch_token (token, regexp, syntax);
2369      return tree;
2370    case OP_PERIOD:
2371      tree = create_token_tree (dfa, NULL, NULL, token);
2372      if (BE (tree == NULL, 0))
2373	{
2374	  *err = REG_ESPACE;
2375	  return NULL;
2376	}
2377      if (dfa->mb_cur_max > 1)
2378	dfa->has_mb_node = 1;
2379      break;
2380    case OP_WORD:
2381    case OP_NOTWORD:
2382      tree = build_charclass_op (dfa, regexp->trans,
2383				 (const unsigned char *) "alnum",
2384				 (const unsigned char *) "_",
2385				 token->type == OP_NOTWORD, err);
2386      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387	return NULL;
2388      break;
2389    case OP_SPACE:
2390    case OP_NOTSPACE:
2391      tree = build_charclass_op (dfa, regexp->trans,
2392				 (const unsigned char *) "space",
2393				 (const unsigned char *) "",
2394				 token->type == OP_NOTSPACE, err);
2395      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396	return NULL;
2397      break;
2398    case OP_ALT:
2399    case END_OF_RE:
2400      return NULL;
2401    case BACK_SLASH:
2402      *err = REG_EESCAPE;
2403      return NULL;
2404    default:
2405      /* Must not happen?  */
2406#ifdef DEBUG
2407      assert (0);
2408#endif
2409      return NULL;
2410    }
2411  fetch_token (token, regexp, syntax);
2412
2413  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415    {
2416      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418	return NULL;
2419      /* In BRE consecutive duplications are not allowed.  */
2420      if ((syntax & RE_CONTEXT_INVALID_DUP)
2421	  && (token->type == OP_DUP_ASTERISK
2422	      || token->type == OP_OPEN_DUP_NUM))
2423	{
2424	  *err = REG_BADRPT;
2425	  return NULL;
2426	}
2427    }
2428
2429  return tree;
2430}
2431
2432/* This function build the following tree, from regular expression
2433   (<reg_exp>):
2434	 SUBEXP
2435	    |
2436	<reg_exp>
2437*/
2438
2439static bin_tree_t *
2440parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2442{
2443  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444  bin_tree_t *tree;
2445  size_t cur_nsub;
2446  cur_nsub = preg->re_nsub++;
2447
2448  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449
2450  /* The subexpression may be a null string.  */
2451  if (token->type == OP_CLOSE_SUBEXP)
2452    tree = NULL;
2453  else
2454    {
2455      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457	{
2458	  if (tree != NULL)
2459	    postorder (tree, free_tree, NULL);
2460	  *err = REG_EPAREN;
2461	}
2462      if (BE (*err != REG_NOERROR, 0))
2463	return NULL;
2464    }
2465
2466  if (cur_nsub <= '9' - '1')
2467    dfa->completed_bkref_map |= 1 << cur_nsub;
2468
2469  tree = create_tree (dfa, tree, NULL, SUBEXP);
2470  if (BE (tree == NULL, 0))
2471    {
2472      *err = REG_ESPACE;
2473      return NULL;
2474    }
2475  tree->token.opr.idx = cur_nsub;
2476  return tree;
2477}
2478
2479/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2480
2481static bin_tree_t *
2482parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2483	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2484{
2485  bin_tree_t *tree = NULL, *old_tree = NULL;
2486  int i, start, end, start_idx = re_string_cur_idx (regexp);
2487  re_token_t start_token = *token;
2488
2489  if (token->type == OP_OPEN_DUP_NUM)
2490    {
2491      end = 0;
2492      start = fetch_number (regexp, token, syntax);
2493      if (start == -1)
2494	{
2495	  if (token->type == CHARACTER && token->opr.c == ',')
2496	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2497	  else
2498	    {
2499	      *err = REG_BADBR; /* <re>{} is invalid.  */
2500	      return NULL;
2501	    }
2502	}
2503      if (BE (start != -2, 1))
2504	{
2505	  /* We treat "{n}" as "{n,n}".  */
2506	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2507		 : ((token->type == CHARACTER && token->opr.c == ',')
2508		    ? fetch_number (regexp, token, syntax) : -2));
2509	}
2510      if (BE (start == -2 || end == -2, 0))
2511	{
2512	  /* Invalid sequence.  */
2513	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2514	    {
2515	      if (token->type == END_OF_RE)
2516		*err = REG_EBRACE;
2517	      else
2518		*err = REG_BADBR;
2519
2520	      return NULL;
2521	    }
2522
2523	  /* If the syntax bit is set, rollback.  */
2524	  re_string_set_index (regexp, start_idx);
2525	  *token = start_token;
2526	  token->type = CHARACTER;
2527	  /* mb_partial and word_char bits should be already initialized by
2528	     peek_token.  */
2529	  return elem;
2530	}
2531
2532      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2533	{
2534	  /* First number greater than second.  */
2535	  *err = REG_BADBR;
2536	  return NULL;
2537	}
2538    }
2539  else
2540    {
2541      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2542      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2543    }
2544
2545  fetch_token (token, regexp, syntax);
2546
2547  if (BE (elem == NULL, 0))
2548    return NULL;
2549  if (BE (start == 0 && end == 0, 0))
2550    {
2551      postorder (elem, free_tree, NULL);
2552      return NULL;
2553    }
2554
2555  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2556  if (BE (start > 0, 0))
2557    {
2558      tree = elem;
2559      for (i = 2; i <= start; ++i)
2560	{
2561	  elem = duplicate_tree (elem, dfa);
2562	  tree = create_tree (dfa, tree, elem, CONCAT);
2563	  if (BE (elem == NULL || tree == NULL, 0))
2564	    goto parse_dup_op_espace;
2565	}
2566
2567      if (start == end)
2568	return tree;
2569
2570      /* Duplicate ELEM before it is marked optional.  */
2571      elem = duplicate_tree (elem, dfa);
2572      old_tree = tree;
2573    }
2574  else
2575    old_tree = NULL;
2576
2577  if (elem->token.type == SUBEXP)
2578    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2579
2580  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2581  if (BE (tree == NULL, 0))
2582    goto parse_dup_op_espace;
2583
2584  /* This loop is actually executed only when end != -1,
2585     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2586     already created the start+1-th copy.  */
2587  for (i = start + 2; i <= end; ++i)
2588    {
2589      elem = duplicate_tree (elem, dfa);
2590      tree = create_tree (dfa, tree, elem, CONCAT);
2591      if (BE (elem == NULL || tree == NULL, 0))
2592	goto parse_dup_op_espace;
2593
2594      tree = create_tree (dfa, tree, NULL, OP_ALT);
2595      if (BE (tree == NULL, 0))
2596	goto parse_dup_op_espace;
2597    }
2598
2599  if (old_tree)
2600    tree = create_tree (dfa, old_tree, tree, CONCAT);
2601
2602  return tree;
2603
2604 parse_dup_op_espace:
2605  *err = REG_ESPACE;
2606  return NULL;
2607}
2608
2609/* Size of the names for collating symbol/equivalence_class/character_class.
2610   I'm not sure, but maybe enough.  */
2611#define BRACKET_NAME_BUF_SIZE 32
2612
2613#ifndef _LIBC
2614  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615     Build the range expression which starts from START_ELEM, and ends
2616     at END_ELEM.  The result are written to MBCSET and SBCSET.
2617     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618     mbcset->range_ends, is a pointer argument sinse we may
2619     update it.  */
2620
2621static reg_errcode_t
2622internal_function
2623# ifdef RE_ENABLE_I18N
2624build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2625		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2626# else /* not RE_ENABLE_I18N */
2627build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2628		 bracket_elem_t *end_elem)
2629# endif /* not RE_ENABLE_I18N */
2630{
2631  unsigned int start_ch, end_ch;
2632  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2633  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2634	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2635	  0))
2636    return REG_ERANGE;
2637
2638  /* We can handle no multi character collating elements without libc
2639     support.  */
2640  if (BE ((start_elem->type == COLL_SYM
2641	   && strlen ((char *) start_elem->opr.name) > 1)
2642	  || (end_elem->type == COLL_SYM
2643	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2644    return REG_ECOLLATE;
2645
2646# ifdef RE_ENABLE_I18N
2647  {
2648    wchar_t wc;
2649    wint_t start_wc;
2650    wint_t end_wc;
2651    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2652
2653    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2654		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2655		   : 0));
2656    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2657	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2658		 : 0));
2659    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2660		? __btowc (start_ch) : start_elem->opr.wch);
2661    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2662	      ? __btowc (end_ch) : end_elem->opr.wch);
2663    if (start_wc == WEOF || end_wc == WEOF)
2664      return REG_ECOLLATE;
2665    cmp_buf[0] = start_wc;
2666    cmp_buf[4] = end_wc;
2667    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2668      return REG_ERANGE;
2669
2670    /* Got valid collation sequence values, add them as a new entry.
2671       However, for !_LIBC we have no collation elements: if the
2672       character set is single byte, the single byte character set
2673       that we build below suffices.  parse_bracket_exp passes
2674       no MBCSET if dfa->mb_cur_max == 1.  */
2675    if (mbcset)
2676      {
2677	/* Check the space of the arrays.  */
2678	if (BE (*range_alloc == mbcset->nranges, 0))
2679	  {
2680	    /* There is not enough space, need realloc.  */
2681	    wchar_t *new_array_start, *new_array_end;
2682	    int new_nranges;
2683
2684	    /* +1 in case of mbcset->nranges is 0.  */
2685	    new_nranges = 2 * mbcset->nranges + 1;
2686	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2687	       are NULL if *range_alloc == 0.  */
2688	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2689					  new_nranges);
2690	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2691					new_nranges);
2692
2693	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2694	      return REG_ESPACE;
2695
2696	    mbcset->range_starts = new_array_start;
2697	    mbcset->range_ends = new_array_end;
2698	    *range_alloc = new_nranges;
2699	  }
2700
2701	mbcset->range_starts[mbcset->nranges] = start_wc;
2702	mbcset->range_ends[mbcset->nranges++] = end_wc;
2703      }
2704
2705    /* Build the table for single byte characters.  */
2706    for (wc = 0; wc < SBC_MAX; ++wc)
2707      {
2708	cmp_buf[2] = wc;
2709	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2710	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2711	  bitset_set (sbcset, wc);
2712      }
2713  }
2714# else /* not RE_ENABLE_I18N */
2715  {
2716    unsigned int ch;
2717    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2718		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719		   : 0));
2720    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2721	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722		 : 0));
2723    if (start_ch > end_ch)
2724      return REG_ERANGE;
2725    /* Build the table for single byte characters.  */
2726    for (ch = 0; ch < SBC_MAX; ++ch)
2727      if (start_ch <= ch  && ch <= end_ch)
2728	bitset_set (sbcset, ch);
2729  }
2730# endif /* not RE_ENABLE_I18N */
2731  return REG_NOERROR;
2732}
2733#endif /* not _LIBC */
2734
2735#ifndef _LIBC
2736/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2737   Build the collating element which is represented by NAME.
2738   The result are written to MBCSET and SBCSET.
2739   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2740   pointer argument since we may update it.  */
2741
2742static reg_errcode_t
2743internal_function
2744# ifdef RE_ENABLE_I18N
2745build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2746			int *coll_sym_alloc, const unsigned char *name)
2747# else /* not RE_ENABLE_I18N */
2748build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2749# endif /* not RE_ENABLE_I18N */
2750{
2751  size_t name_len = strlen ((const char *) name);
2752  if (BE (name_len != 1, 0))
2753    return REG_ECOLLATE;
2754  else
2755    {
2756      bitset_set (sbcset, name[0]);
2757      return REG_NOERROR;
2758    }
2759}
2760#endif /* not _LIBC */
2761
2762/* This function parse bracket expression like "[abc]", "[a-c]",
2763   "[[.a-a.]]" etc.  */
2764
2765static bin_tree_t *
2766parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2767		   reg_syntax_t syntax, reg_errcode_t *err)
2768{
2769#ifdef _LIBC
2770  const unsigned char *collseqmb;
2771  const char *collseqwc;
2772  uint32_t nrules;
2773  int32_t table_size;
2774  const int32_t *symb_table;
2775  const unsigned char *extra;
2776
2777  /* Local function for parse_bracket_exp used in _LIBC environement.
2778     Seek the collating symbol entry correspondings to NAME.
2779     Return the index of the symbol in the SYMB_TABLE,
2780     or -1 if not found.  */
2781
2782  auto inline int32_t
2783  __attribute ((always_inline))
2784  seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2785    {
2786      int32_t elem;
2787
2788      for (elem = 0; elem < table_size; elem++)
2789	if (symb_table[2 * elem] != 0)
2790	  {
2791	    int32_t idx = symb_table[2 * elem + 1];
2792	    /* Skip the name of collating element name.  */
2793	    idx += 1 + extra[idx];
2794	    if (/* Compare the length of the name.  */
2795		name_len == extra[idx]
2796		/* Compare the name.  */
2797		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2798	      /* Yep, this is the entry.  */
2799	      return elem;
2800	  }
2801      return -1;
2802    }
2803
2804  /* Local function for parse_bracket_exp used in _LIBC environment.
2805     Look up the collation sequence value of BR_ELEM.
2806     Return the value if succeeded, UINT_MAX otherwise.  */
2807
2808  auto inline unsigned int
2809  __attribute ((always_inline))
2810  lookup_collation_sequence_value (bracket_elem_t *br_elem)
2811    {
2812      if (br_elem->type == SB_CHAR)
2813	{
2814	  /*
2815	  if (MB_CUR_MAX == 1)
2816	  */
2817	  if (nrules == 0)
2818	    return collseqmb[br_elem->opr.ch];
2819	  else
2820	    {
2821	      wint_t wc = __btowc (br_elem->opr.ch);
2822	      return __collseq_table_lookup (collseqwc, wc);
2823	    }
2824	}
2825      else if (br_elem->type == MB_CHAR)
2826	{
2827	  if (nrules != 0)
2828	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2829	}
2830      else if (br_elem->type == COLL_SYM)
2831	{
2832	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2833	  if (nrules != 0)
2834	    {
2835	      int32_t elem, idx;
2836	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2837						  sym_name_len);
2838	      if (elem != -1)
2839		{
2840		  /* We found the entry.  */
2841		  idx = symb_table[2 * elem + 1];
2842		  /* Skip the name of collating element name.  */
2843		  idx += 1 + extra[idx];
2844		  /* Skip the byte sequence of the collating element.  */
2845		  idx += 1 + extra[idx];
2846		  /* Adjust for the alignment.  */
2847		  idx = (idx + 3) & ~3;
2848		  /* Skip the multibyte collation sequence value.  */
2849		  idx += sizeof (unsigned int);
2850		  /* Skip the wide char sequence of the collating element.  */
2851		  idx += sizeof (unsigned int) *
2852		    (1 + *(unsigned int *) (extra + idx));
2853		  /* Return the collation sequence value.  */
2854		  return *(unsigned int *) (extra + idx);
2855		}
2856	      else if (sym_name_len == 1)
2857		{
2858		  /* No valid character.  Match it as a single byte
2859		     character.  */
2860		  return collseqmb[br_elem->opr.name[0]];
2861		}
2862	    }
2863	  else if (sym_name_len == 1)
2864	    return collseqmb[br_elem->opr.name[0]];
2865	}
2866      return UINT_MAX;
2867    }
2868
2869  /* Local function for parse_bracket_exp used in _LIBC environement.
2870     Build the range expression which starts from START_ELEM, and ends
2871     at END_ELEM.  The result are written to MBCSET and SBCSET.
2872     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2873     mbcset->range_ends, is a pointer argument sinse we may
2874     update it.  */
2875
2876  auto inline reg_errcode_t
2877  __attribute ((always_inline))
2878  build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2879		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2880    {
2881      unsigned int ch;
2882      uint32_t start_collseq;
2883      uint32_t end_collseq;
2884
2885      /* Equivalence Classes and Character Classes can't be a range
2886	 start/end.  */
2887      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2888	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2889	      0))
2890	return REG_ERANGE;
2891
2892      start_collseq = lookup_collation_sequence_value (start_elem);
2893      end_collseq = lookup_collation_sequence_value (end_elem);
2894      /* Check start/end collation sequence values.  */
2895      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2896	return REG_ECOLLATE;
2897      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2898	return REG_ERANGE;
2899
2900      /* Got valid collation sequence values, add them as a new entry.
2901	 However, if we have no collation elements, and the character set
2902	 is single byte, the single byte character set that we
2903	 build below suffices. */
2904      if (nrules > 0 || dfa->mb_cur_max > 1)
2905	{
2906	  /* Check the space of the arrays.  */
2907	  if (BE (*range_alloc == mbcset->nranges, 0))
2908	    {
2909	      /* There is not enough space, need realloc.  */
2910	      uint32_t *new_array_start;
2911	      uint32_t *new_array_end;
2912	      int new_nranges;
2913
2914	      /* +1 in case of mbcset->nranges is 0.  */
2915	      new_nranges = 2 * mbcset->nranges + 1;
2916	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2917					    new_nranges);
2918	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2919					  new_nranges);
2920
2921	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2922		return REG_ESPACE;
2923
2924	      mbcset->range_starts = new_array_start;
2925	      mbcset->range_ends = new_array_end;
2926	      *range_alloc = new_nranges;
2927	    }
2928
2929	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2930	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2931	}
2932
2933      /* Build the table for single byte characters.  */
2934      for (ch = 0; ch < SBC_MAX; ch++)
2935	{
2936	  uint32_t ch_collseq;
2937	  /*
2938	  if (MB_CUR_MAX == 1)
2939	  */
2940	  if (nrules == 0)
2941	    ch_collseq = collseqmb[ch];
2942	  else
2943	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2944	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2945	    bitset_set (sbcset, ch);
2946	}
2947      return REG_NOERROR;
2948    }
2949
2950  /* Local function for parse_bracket_exp used in _LIBC environement.
2951     Build the collating element which is represented by NAME.
2952     The result are written to MBCSET and SBCSET.
2953     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2954     pointer argument sinse we may update it.  */
2955
2956  auto inline reg_errcode_t
2957  __attribute ((always_inline))
2958  build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2959			  int *coll_sym_alloc, const unsigned char *name)
2960    {
2961      int32_t elem, idx;
2962      size_t name_len = strlen ((const char *) name);
2963      if (nrules != 0)
2964	{
2965	  elem = seek_collating_symbol_entry (name, name_len);
2966	  if (elem != -1)
2967	    {
2968	      /* We found the entry.  */
2969	      idx = symb_table[2 * elem + 1];
2970	      /* Skip the name of collating element name.  */
2971	      idx += 1 + extra[idx];
2972	    }
2973	  else if (name_len == 1)
2974	    {
2975	      /* No valid character, treat it as a normal
2976		 character.  */
2977	      bitset_set (sbcset, name[0]);
2978	      return REG_NOERROR;
2979	    }
2980	  else
2981	    return REG_ECOLLATE;
2982
2983	  /* Got valid collation sequence, add it as a new entry.  */
2984	  /* Check the space of the arrays.  */
2985	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2986	    {
2987	      /* Not enough, realloc it.  */
2988	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2989	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2990	      /* Use realloc since mbcset->coll_syms is NULL
2991		 if *alloc == 0.  */
2992	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2993						   new_coll_sym_alloc);
2994	      if (BE (new_coll_syms == NULL, 0))
2995		return REG_ESPACE;
2996	      mbcset->coll_syms = new_coll_syms;
2997	      *coll_sym_alloc = new_coll_sym_alloc;
2998	    }
2999	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3000	  return REG_NOERROR;
3001	}
3002      else
3003	{
3004	  if (BE (name_len != 1, 0))
3005	    return REG_ECOLLATE;
3006	  else
3007	    {
3008	      bitset_set (sbcset, name[0]);
3009	      return REG_NOERROR;
3010	    }
3011	}
3012    }
3013#endif
3014
3015  re_token_t br_token;
3016  re_bitset_ptr_t sbcset;
3017#ifdef RE_ENABLE_I18N
3018  re_charset_t *mbcset;
3019  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3020  int equiv_class_alloc = 0, char_class_alloc = 0;
3021#endif /* not RE_ENABLE_I18N */
3022  int non_match = 0;
3023  bin_tree_t *work_tree;
3024  int token_len;
3025  int first_round = 1;
3026#ifdef _LIBC
3027  collseqmb = (const unsigned char *)
3028    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3029  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3030  if (nrules)
3031    {
3032      /*
3033      if (MB_CUR_MAX > 1)
3034      */
3035      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3036      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3037      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3038						  _NL_COLLATE_SYMB_TABLEMB);
3039      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3040						   _NL_COLLATE_SYMB_EXTRAMB);
3041    }
3042#endif
3043  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3044#ifdef RE_ENABLE_I18N
3045  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3046#endif /* RE_ENABLE_I18N */
3047#ifdef RE_ENABLE_I18N
3048  if (BE (sbcset == NULL || mbcset == NULL, 0))
3049#else
3050  if (BE (sbcset == NULL, 0))
3051#endif /* RE_ENABLE_I18N */
3052    {
3053      re_free (sbcset);
3054#ifdef RE_ENABLE_I18N
3055      re_free (mbcset);
3056#endif
3057      *err = REG_ESPACE;
3058      return NULL;
3059    }
3060
3061  token_len = peek_token_bracket (token, regexp, syntax);
3062  if (BE (token->type == END_OF_RE, 0))
3063    {
3064      *err = REG_BADPAT;
3065      goto parse_bracket_exp_free_return;
3066    }
3067  if (token->type == OP_NON_MATCH_LIST)
3068    {
3069#ifdef RE_ENABLE_I18N
3070      mbcset->non_match = 1;
3071#endif /* not RE_ENABLE_I18N */
3072      non_match = 1;
3073      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3074	bitset_set (sbcset, '\n');
3075      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3076      token_len = peek_token_bracket (token, regexp, syntax);
3077      if (BE (token->type == END_OF_RE, 0))
3078	{
3079	  *err = REG_BADPAT;
3080	  goto parse_bracket_exp_free_return;
3081	}
3082    }
3083
3084  /* We treat the first ']' as a normal character.  */
3085  if (token->type == OP_CLOSE_BRACKET)
3086    token->type = CHARACTER;
3087
3088  while (1)
3089    {
3090      bracket_elem_t start_elem, end_elem;
3091      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3092      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3093      reg_errcode_t ret;
3094      int token_len2 = 0, is_range_exp = 0;
3095      re_token_t token2;
3096
3097      start_elem.opr.name = start_name_buf;
3098      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3099				   syntax, first_round);
3100      if (BE (ret != REG_NOERROR, 0))
3101	{
3102	  *err = ret;
3103	  goto parse_bracket_exp_free_return;
3104	}
3105      first_round = 0;
3106
3107      /* Get information about the next token.  We need it in any case.  */
3108      token_len = peek_token_bracket (token, regexp, syntax);
3109
3110      /* Do not check for ranges if we know they are not allowed.  */
3111      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3112	{
3113	  if (BE (token->type == END_OF_RE, 0))
3114	    {
3115	      *err = REG_EBRACK;
3116	      goto parse_bracket_exp_free_return;
3117	    }
3118	  if (token->type == OP_CHARSET_RANGE)
3119	    {
3120	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3121	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3122	      if (BE (token2.type == END_OF_RE, 0))
3123		{
3124		  *err = REG_EBRACK;
3125		  goto parse_bracket_exp_free_return;
3126		}
3127	      if (token2.type == OP_CLOSE_BRACKET)
3128		{
3129		  /* We treat the last '-' as a normal character.  */
3130		  re_string_skip_bytes (regexp, -token_len);
3131		  token->type = CHARACTER;
3132		}
3133	      else
3134		is_range_exp = 1;
3135	    }
3136	}
3137
3138      if (is_range_exp == 1)
3139	{
3140	  end_elem.opr.name = end_name_buf;
3141	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3142				       dfa, syntax, 1);
3143	  if (BE (ret != REG_NOERROR, 0))
3144	    {
3145	      *err = ret;
3146	      goto parse_bracket_exp_free_return;
3147	    }
3148
3149	  token_len = peek_token_bracket (token, regexp, syntax);
3150
3151#ifdef _LIBC
3152	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3153				  &start_elem, &end_elem);
3154#else
3155# ifdef RE_ENABLE_I18N
3156	  *err = build_range_exp (sbcset,
3157				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3158				  &range_alloc, &start_elem, &end_elem);
3159# else
3160	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3161# endif
3162#endif /* RE_ENABLE_I18N */
3163	  if (BE (*err != REG_NOERROR, 0))
3164	    goto parse_bracket_exp_free_return;
3165	}
3166      else
3167	{
3168	  switch (start_elem.type)
3169	    {
3170	    case SB_CHAR:
3171	      bitset_set (sbcset, start_elem.opr.ch);
3172	      break;
3173#ifdef RE_ENABLE_I18N
3174	    case MB_CHAR:
3175	      /* Check whether the array has enough space.  */
3176	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3177		{
3178		  wchar_t *new_mbchars;
3179		  /* Not enough, realloc it.  */
3180		  /* +1 in case of mbcset->nmbchars is 0.  */
3181		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3182		  /* Use realloc since array is NULL if *alloc == 0.  */
3183		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3184					    mbchar_alloc);
3185		  if (BE (new_mbchars == NULL, 0))
3186		    goto parse_bracket_exp_espace;
3187		  mbcset->mbchars = new_mbchars;
3188		}
3189	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3190	      break;
3191#endif /* RE_ENABLE_I18N */
3192	    case EQUIV_CLASS:
3193	      *err = build_equiv_class (sbcset,
3194#ifdef RE_ENABLE_I18N
3195					mbcset, &equiv_class_alloc,
3196#endif /* RE_ENABLE_I18N */
3197					start_elem.opr.name);
3198	      if (BE (*err != REG_NOERROR, 0))
3199		goto parse_bracket_exp_free_return;
3200	      break;
3201	    case COLL_SYM:
3202	      *err = build_collating_symbol (sbcset,
3203#ifdef RE_ENABLE_I18N
3204					     mbcset, &coll_sym_alloc,
3205#endif /* RE_ENABLE_I18N */
3206					     start_elem.opr.name);
3207	      if (BE (*err != REG_NOERROR, 0))
3208		goto parse_bracket_exp_free_return;
3209	      break;
3210	    case CHAR_CLASS:
3211	      *err = build_charclass (regexp->trans, sbcset,
3212#ifdef RE_ENABLE_I18N
3213				      mbcset, &char_class_alloc,
3214#endif /* RE_ENABLE_I18N */
3215				      start_elem.opr.name, syntax);
3216	      if (BE (*err != REG_NOERROR, 0))
3217	       goto parse_bracket_exp_free_return;
3218	      break;
3219	    default:
3220	      assert (0);
3221	      break;
3222	    }
3223	}
3224      if (BE (token->type == END_OF_RE, 0))
3225	{
3226	  *err = REG_EBRACK;
3227	  goto parse_bracket_exp_free_return;
3228	}
3229      if (token->type == OP_CLOSE_BRACKET)
3230	break;
3231    }
3232
3233  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3234
3235  /* If it is non-matching list.  */
3236  if (non_match)
3237    bitset_not (sbcset);
3238
3239#ifdef RE_ENABLE_I18N
3240  /* Ensure only single byte characters are set.  */
3241  if (dfa->mb_cur_max > 1)
3242    bitset_mask (sbcset, dfa->sb_char);
3243
3244  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3245      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3246						     || mbcset->non_match)))
3247    {
3248      bin_tree_t *mbc_tree;
3249      int sbc_idx;
3250      /* Build a tree for complex bracket.  */
3251      dfa->has_mb_node = 1;
3252      br_token.type = COMPLEX_BRACKET;
3253      br_token.opr.mbcset = mbcset;
3254      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3255      if (BE (mbc_tree == NULL, 0))
3256	goto parse_bracket_exp_espace;
3257      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3258	if (sbcset[sbc_idx])
3259	  break;
3260      /* If there are no bits set in sbcset, there is no point
3261	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3262      if (sbc_idx < BITSET_WORDS)
3263	{
3264	  /* Build a tree for simple bracket.  */
3265	  br_token.type = SIMPLE_BRACKET;
3266	  br_token.opr.sbcset = sbcset;
3267	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3268	  if (BE (work_tree == NULL, 0))
3269	    goto parse_bracket_exp_espace;
3270
3271	  /* Then join them by ALT node.  */
3272	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3273	  if (BE (work_tree == NULL, 0))
3274	    goto parse_bracket_exp_espace;
3275	}
3276      else
3277	{
3278	  re_free (sbcset);
3279	  work_tree = mbc_tree;
3280	}
3281    }
3282  else
3283#endif /* not RE_ENABLE_I18N */
3284    {
3285#ifdef RE_ENABLE_I18N
3286      free_charset (mbcset);
3287#endif
3288      /* Build a tree for simple bracket.  */
3289      br_token.type = SIMPLE_BRACKET;
3290      br_token.opr.sbcset = sbcset;
3291      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292      if (BE (work_tree == NULL, 0))
3293	goto parse_bracket_exp_espace;
3294    }
3295  return work_tree;
3296
3297 parse_bracket_exp_espace:
3298  *err = REG_ESPACE;
3299 parse_bracket_exp_free_return:
3300  re_free (sbcset);
3301#ifdef RE_ENABLE_I18N
3302  free_charset (mbcset);
3303#endif /* RE_ENABLE_I18N */
3304  return NULL;
3305}
3306
3307/* Parse an element in the bracket expression.  */
3308
3309static reg_errcode_t
3310parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3311		       re_token_t *token, int token_len, re_dfa_t *dfa,
3312		       reg_syntax_t syntax, int accept_hyphen)
3313{
3314#ifdef RE_ENABLE_I18N
3315  int cur_char_size;
3316  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3317  if (cur_char_size > 1)
3318    {
3319      elem->type = MB_CHAR;
3320      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3321      re_string_skip_bytes (regexp, cur_char_size);
3322      return REG_NOERROR;
3323    }
3324#endif /* RE_ENABLE_I18N */
3325  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3326  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3327      || token->type == OP_OPEN_EQUIV_CLASS)
3328    return parse_bracket_symbol (elem, regexp, token);
3329  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3330    {
3331      /* A '-' must only appear as anything but a range indicator before
3332	 the closing bracket.  Everything else is an error.  */
3333      re_token_t token2;
3334      (void) peek_token_bracket (&token2, regexp, syntax);
3335      if (token2.type != OP_CLOSE_BRACKET)
3336	/* The actual error value is not standardized since this whole
3337	   case is undefined.  But ERANGE makes good sense.  */
3338	return REG_ERANGE;
3339    }
3340  elem->type = SB_CHAR;
3341  elem->opr.ch = token->opr.c;
3342  return REG_NOERROR;
3343}
3344
3345/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3346   such as [:<character_class>:], [.<collating_element>.], and
3347   [=<equivalent_class>=].  */
3348
3349static reg_errcode_t
3350parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3351		      re_token_t *token)
3352{
3353  unsigned char ch, delim = token->opr.c;
3354  int i = 0;
3355  if (re_string_eoi(regexp))
3356    return REG_EBRACK;
3357  for (;; ++i)
3358    {
3359      if (i >= BRACKET_NAME_BUF_SIZE)
3360	return REG_EBRACK;
3361      if (token->type == OP_OPEN_CHAR_CLASS)
3362	ch = re_string_fetch_byte_case (regexp);
3363      else
3364	ch = re_string_fetch_byte (regexp);
3365      if (re_string_eoi(regexp))
3366	return REG_EBRACK;
3367      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3368	break;
3369      elem->opr.name[i] = ch;
3370    }
3371  re_string_skip_bytes (regexp, 1);
3372  elem->opr.name[i] = '\0';
3373  switch (token->type)
3374    {
3375    case OP_OPEN_COLL_ELEM:
3376      elem->type = COLL_SYM;
3377      break;
3378    case OP_OPEN_EQUIV_CLASS:
3379      elem->type = EQUIV_CLASS;
3380      break;
3381    case OP_OPEN_CHAR_CLASS:
3382      elem->type = CHAR_CLASS;
3383      break;
3384    default:
3385      break;
3386    }
3387  return REG_NOERROR;
3388}
3389
3390  /* Helper function for parse_bracket_exp.
3391     Build the equivalence class which is represented by NAME.
3392     The result are written to MBCSET and SBCSET.
3393     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3394     is a pointer argument sinse we may update it.  */
3395
3396static reg_errcode_t
3397#ifdef RE_ENABLE_I18N
3398build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3399		   int *equiv_class_alloc, const unsigned char *name)
3400#else /* not RE_ENABLE_I18N */
3401build_equiv_class (bitset_t sbcset, const unsigned char *name)
3402#endif /* not RE_ENABLE_I18N */
3403{
3404#ifdef _LIBC
3405  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3406  if (nrules != 0)
3407    {
3408      const int32_t *table, *indirect;
3409      const unsigned char *weights, *extra, *cp;
3410      unsigned char char_buf[2];
3411      int32_t idx1, idx2;
3412      unsigned int ch;
3413      size_t len;
3414      /* This #include defines a local function!  */
3415# include <locale/weight.h>
3416      /* Calculate the index for equivalence class.  */
3417      cp = name;
3418      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3419      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3420					       _NL_COLLATE_WEIGHTMB);
3421      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422						   _NL_COLLATE_EXTRAMB);
3423      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3424						_NL_COLLATE_INDIRECTMB);
3425      idx1 = findidx (&cp, -1);
3426      if (BE (idx1 == 0 || *cp != '\0', 0))
3427	/* This isn't a valid character.  */
3428	return REG_ECOLLATE;
3429
3430      /* Build single byte matcing table for this equivalence class.  */
3431      len = weights[idx1 & 0xffffff];
3432      for (ch = 0; ch < SBC_MAX; ++ch)
3433	{
3434	  char_buf[0] = ch;
3435	  cp = char_buf;
3436	  idx2 = findidx (&cp, 1);
3437/*
3438	  idx2 = table[ch];
3439*/
3440	  if (idx2 == 0)
3441	    /* This isn't a valid character.  */
3442	    continue;
3443	  /* Compare only if the length matches and the collation rule
3444	     index is the same.  */
3445	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3446	    {
3447	      int cnt = 0;
3448
3449	      while (cnt <= len &&
3450		     weights[(idx1 & 0xffffff) + 1 + cnt]
3451		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3452		++cnt;
3453
3454	      if (cnt > len)
3455		bitset_set (sbcset, ch);
3456	    }
3457	}
3458      /* Check whether the array has enough space.  */
3459      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3460	{
3461	  /* Not enough, realloc it.  */
3462	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3463	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3464	  /* Use realloc since the array is NULL if *alloc == 0.  */
3465	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3466						   int32_t,
3467						   new_equiv_class_alloc);
3468	  if (BE (new_equiv_classes == NULL, 0))
3469	    return REG_ESPACE;
3470	  mbcset->equiv_classes = new_equiv_classes;
3471	  *equiv_class_alloc = new_equiv_class_alloc;
3472	}
3473      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3474    }
3475  else
3476#endif /* _LIBC */
3477    {
3478      if (BE (strlen ((const char *) name) != 1, 0))
3479	return REG_ECOLLATE;
3480      bitset_set (sbcset, *name);
3481    }
3482  return REG_NOERROR;
3483}
3484
3485  /* Helper function for parse_bracket_exp.
3486     Build the character class which is represented by NAME.
3487     The result are written to MBCSET and SBCSET.
3488     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3489     is a pointer argument sinse we may update it.  */
3490
3491static reg_errcode_t
3492#ifdef RE_ENABLE_I18N
3493build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3494		 re_charset_t *mbcset, int *char_class_alloc,
3495		 const unsigned char *class_name, reg_syntax_t syntax)
3496#else /* not RE_ENABLE_I18N */
3497build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3498		 const unsigned char *class_name, reg_syntax_t syntax)
3499#endif /* not RE_ENABLE_I18N */
3500{
3501  int i;
3502  const char *name = (const char *) class_name;
3503
3504  /* In case of REG_ICASE "upper" and "lower" match the both of
3505     upper and lower cases.  */
3506  if ((syntax & RE_ICASE)
3507      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3508    name = "alpha";
3509
3510#ifdef RE_ENABLE_I18N
3511  /* Check the space of the arrays.  */
3512  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3513    {
3514      /* Not enough, realloc it.  */
3515      /* +1 in case of mbcset->nchar_classes is 0.  */
3516      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3517      /* Use realloc since array is NULL if *alloc == 0.  */
3518      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3519					       new_char_class_alloc);
3520      if (BE (new_char_classes == NULL, 0))
3521	return REG_ESPACE;
3522      mbcset->char_classes = new_char_classes;
3523      *char_class_alloc = new_char_class_alloc;
3524    }
3525  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3526#endif /* RE_ENABLE_I18N */
3527
3528#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3529  do {						\
3530    if (BE (trans != NULL, 0))			\
3531      {						\
3532	for (i = 0; i < SBC_MAX; ++i)		\
3533  	  if (ctype_func (i))			\
3534	    bitset_set (sbcset, trans[i]);	\
3535      }						\
3536    else					\
3537      {						\
3538	for (i = 0; i < SBC_MAX; ++i)		\
3539  	  if (ctype_func (i))			\
3540	    bitset_set (sbcset, i);		\
3541      }						\
3542  } while (0)
3543
3544  if (strcmp (name, "alnum") == 0)
3545    BUILD_CHARCLASS_LOOP (isalnum);
3546  else if (strcmp (name, "cntrl") == 0)
3547    BUILD_CHARCLASS_LOOP (iscntrl);
3548  else if (strcmp (name, "lower") == 0)
3549    BUILD_CHARCLASS_LOOP (islower);
3550  else if (strcmp (name, "space") == 0)
3551    BUILD_CHARCLASS_LOOP (isspace);
3552  else if (strcmp (name, "alpha") == 0)
3553    BUILD_CHARCLASS_LOOP (isalpha);
3554  else if (strcmp (name, "digit") == 0)
3555    BUILD_CHARCLASS_LOOP (isdigit);
3556  else if (strcmp (name, "print") == 0)
3557    BUILD_CHARCLASS_LOOP (isprint);
3558  else if (strcmp (name, "upper") == 0)
3559    BUILD_CHARCLASS_LOOP (isupper);
3560  else if (strcmp (name, "blank") == 0)
3561    BUILD_CHARCLASS_LOOP (isblank);
3562  else if (strcmp (name, "graph") == 0)
3563    BUILD_CHARCLASS_LOOP (isgraph);
3564  else if (strcmp (name, "punct") == 0)
3565    BUILD_CHARCLASS_LOOP (ispunct);
3566  else if (strcmp (name, "xdigit") == 0)
3567    BUILD_CHARCLASS_LOOP (isxdigit);
3568  else
3569    return REG_ECTYPE;
3570
3571  return REG_NOERROR;
3572}
3573
3574static bin_tree_t *
3575build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3576		    const unsigned char *class_name,
3577		    const unsigned char *extra, int non_match,
3578		    reg_errcode_t *err)
3579{
3580  re_bitset_ptr_t sbcset;
3581#ifdef RE_ENABLE_I18N
3582  re_charset_t *mbcset;
3583  int alloc = 0;
3584#endif /* not RE_ENABLE_I18N */
3585  reg_errcode_t ret;
3586  re_token_t br_token;
3587  bin_tree_t *tree;
3588
3589  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3590#ifdef RE_ENABLE_I18N
3591  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3592#endif /* RE_ENABLE_I18N */
3593
3594#ifdef RE_ENABLE_I18N
3595  if (BE (sbcset == NULL || mbcset == NULL, 0))
3596#else /* not RE_ENABLE_I18N */
3597  if (BE (sbcset == NULL, 0))
3598#endif /* not RE_ENABLE_I18N */
3599    {
3600      *err = REG_ESPACE;
3601      return NULL;
3602    }
3603
3604  if (non_match)
3605    {
3606#ifdef RE_ENABLE_I18N
3607      mbcset->non_match = 1;
3608#endif /* not RE_ENABLE_I18N */
3609    }
3610
3611  /* We don't care the syntax in this case.  */
3612  ret = build_charclass (trans, sbcset,
3613#ifdef RE_ENABLE_I18N
3614			 mbcset, &alloc,
3615#endif /* RE_ENABLE_I18N */
3616			 class_name, 0);
3617
3618  if (BE (ret != REG_NOERROR, 0))
3619    {
3620      re_free (sbcset);
3621#ifdef RE_ENABLE_I18N
3622      free_charset (mbcset);
3623#endif /* RE_ENABLE_I18N */
3624      *err = ret;
3625      return NULL;
3626    }
3627  /* \w match '_' also.  */
3628  for (; *extra; extra++)
3629    bitset_set (sbcset, *extra);
3630
3631  /* If it is non-matching list.  */
3632  if (non_match)
3633    bitset_not (sbcset);
3634
3635#ifdef RE_ENABLE_I18N
3636  /* Ensure only single byte characters are set.  */
3637  if (dfa->mb_cur_max > 1)
3638    bitset_mask (sbcset, dfa->sb_char);
3639#endif
3640
3641  /* Build a tree for simple bracket.  */
3642  br_token.type = SIMPLE_BRACKET;
3643  br_token.opr.sbcset = sbcset;
3644  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3645  if (BE (tree == NULL, 0))
3646    goto build_word_op_espace;
3647
3648#ifdef RE_ENABLE_I18N
3649  if (dfa->mb_cur_max > 1)
3650    {
3651      bin_tree_t *mbc_tree;
3652      /* Build a tree for complex bracket.  */
3653      br_token.type = COMPLEX_BRACKET;
3654      br_token.opr.mbcset = mbcset;
3655      dfa->has_mb_node = 1;
3656      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3657      if (BE (mbc_tree == NULL, 0))
3658	goto build_word_op_espace;
3659      /* Then join them by ALT node.  */
3660      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3661      if (BE (mbc_tree != NULL, 1))
3662	return tree;
3663    }
3664  else
3665    {
3666      free_charset (mbcset);
3667      return tree;
3668    }
3669#else /* not RE_ENABLE_I18N */
3670  return tree;
3671#endif /* not RE_ENABLE_I18N */
3672
3673 build_word_op_espace:
3674  re_free (sbcset);
3675#ifdef RE_ENABLE_I18N
3676  free_charset (mbcset);
3677#endif /* RE_ENABLE_I18N */
3678  *err = REG_ESPACE;
3679  return NULL;
3680}
3681
3682/* This is intended for the expressions like "a{1,3}".
3683   Fetch a number from `input', and return the number.
3684   Return -1, if the number field is empty like "{,1}".
3685   Return -2, If an error is occured.  */
3686
3687static int
3688fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3689{
3690  int num = -1;
3691  unsigned char c;
3692  while (1)
3693    {
3694      fetch_token (token, input, syntax);
3695      c = token->opr.c;
3696      if (BE (token->type == END_OF_RE, 0))
3697	return -2;
3698      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3699	break;
3700      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3701	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3702      num = (num > RE_DUP_MAX) ? -2 : num;
3703    }
3704  return num;
3705}
3706
3707#ifdef RE_ENABLE_I18N
3708static void
3709free_charset (re_charset_t *cset)
3710{
3711  re_free (cset->mbchars);
3712# ifdef _LIBC
3713  re_free (cset->coll_syms);
3714  re_free (cset->equiv_classes);
3715  re_free (cset->range_starts);
3716  re_free (cset->range_ends);
3717# endif
3718  re_free (cset->char_classes);
3719  re_free (cset);
3720}
3721#endif /* RE_ENABLE_I18N */
3722
3723/* Functions for binary tree operation.  */
3724
3725/* Create a tree node.  */
3726
3727static bin_tree_t *
3728create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3729	     re_token_type_t type)
3730{
3731  re_token_t t;
3732  t.type = type;
3733  return create_token_tree (dfa, left, right, &t);
3734}
3735
3736static bin_tree_t *
3737create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3738		   const re_token_t *token)
3739{
3740  bin_tree_t *tree;
3741  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3742    {
3743      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3744
3745      if (storage == NULL)
3746	return NULL;
3747      storage->next = dfa->str_tree_storage;
3748      dfa->str_tree_storage = storage;
3749      dfa->str_tree_storage_idx = 0;
3750    }
3751  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3752
3753  tree->parent = NULL;
3754  tree->left = left;
3755  tree->right = right;
3756  tree->token = *token;
3757  tree->token.duplicated = 0;
3758  tree->token.opt_subexp = 0;
3759  tree->first = NULL;
3760  tree->next = NULL;
3761  tree->node_idx = -1;
3762
3763  if (left != NULL)
3764    left->parent = tree;
3765  if (right != NULL)
3766    right->parent = tree;
3767  return tree;
3768}
3769
3770/* Mark the tree SRC as an optional subexpression.
3771   To be called from preorder or postorder.  */
3772
3773static reg_errcode_t
3774mark_opt_subexp (void *extra, bin_tree_t *node)
3775{
3776  int idx = (int) (long) extra;
3777  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3778    node->token.opt_subexp = 1;
3779
3780  return REG_NOERROR;
3781}
3782
3783/* Free the allocated memory inside NODE. */
3784
3785static void
3786free_token (re_token_t *node)
3787{
3788#ifdef RE_ENABLE_I18N
3789  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3790    free_charset (node->opr.mbcset);
3791  else
3792#endif /* RE_ENABLE_I18N */
3793    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3794      re_free (node->opr.sbcset);
3795}
3796
3797/* Worker function for tree walking.  Free the allocated memory inside NODE
3798   and its children. */
3799
3800static reg_errcode_t
3801free_tree (void *extra, bin_tree_t *node)
3802{
3803  free_token (&node->token);
3804  return REG_NOERROR;
3805}
3806
3807
3808/* Duplicate the node SRC, and return new node.  This is a preorder
3809   visit similar to the one implemented by the generic visitor, but
3810   we need more infrastructure to maintain two parallel trees --- so,
3811   it's easier to duplicate.  */
3812
3813static bin_tree_t *
3814duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3815{
3816  const bin_tree_t *node;
3817  bin_tree_t *dup_root;
3818  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3819
3820  for (node = root; ; )
3821    {
3822      /* Create a new tree and link it back to the current parent.  */
3823      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3824      if (*p_new == NULL)
3825	return NULL;
3826      (*p_new)->parent = dup_node;
3827      (*p_new)->token.duplicated = 1;
3828      dup_node = *p_new;
3829
3830      /* Go to the left node, or up and to the right.  */
3831      if (node->left)
3832	{
3833	  node = node->left;
3834	  p_new = &dup_node->left;
3835	}
3836      else
3837	{
3838	  const bin_tree_t *prev = NULL;
3839	  while (node->right == prev || node->right == NULL)
3840	    {
3841	      prev = node;
3842	      node = node->parent;
3843	      dup_node = dup_node->parent;
3844	      if (!node)
3845		return dup_root;
3846	    }
3847	  node = node->right;
3848	  p_new = &dup_node->right;
3849	}
3850    }
3851}
3852