1/* Extended regular expression matching and search library.
2   Copyright (C) 2002,2003,2004,2005,2006,2007,2009
3   Free Software Foundation, Inc.
4   This file is part of the GNU C Library.
5   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6
7   The GNU C Library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 2.1 of the License, or (at your option) any later version.
11
12   The GNU C Library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   Lesser General Public License for more details.
16
17   You should have received a copy of the GNU Lesser General Public
18   License along with the GNU C Library; if not, write to the Free
19   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20   02111-1307 USA.  */
21
22static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
23					  size_t length, reg_syntax_t syntax);
24static void re_compile_fastmap_iter (regex_t *bufp,
25				     const re_dfastate_t *init_state,
26				     char *fastmap);
27static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28#ifdef RE_ENABLE_I18N
29static void free_charset (re_charset_t *cset);
30#endif /* RE_ENABLE_I18N */
31static void free_workarea_compile (regex_t *preg);
32static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33#ifdef RE_ENABLE_I18N
34static void optimize_utf8 (re_dfa_t *dfa);
35#endif
36static reg_errcode_t analyze (regex_t *preg);
37static reg_errcode_t preorder (bin_tree_t *root,
38			       reg_errcode_t (fn (void *, bin_tree_t *)),
39			       void *extra);
40static reg_errcode_t postorder (bin_tree_t *root,
41				reg_errcode_t (fn (void *, bin_tree_t *)),
42				void *extra);
43static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46				 bin_tree_t *node);
47static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
51static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
52				   unsigned int constraint);
53static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
54static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55					 int node, int root);
56static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
57static int fetch_number (re_string_t *input, re_token_t *token,
58			 reg_syntax_t syntax);
59static int peek_token (re_token_t *token, re_string_t *input,
60			reg_syntax_t syntax) internal_function;
61static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
62			  reg_syntax_t syntax, reg_errcode_t *err);
63static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
64				  re_token_t *token, reg_syntax_t syntax,
65				  int nest, reg_errcode_t *err);
66static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
67				 re_token_t *token, reg_syntax_t syntax,
68				 int nest, reg_errcode_t *err);
69static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
70				     re_token_t *token, reg_syntax_t syntax,
71				     int nest, reg_errcode_t *err);
72static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
73				  re_token_t *token, reg_syntax_t syntax,
74				  int nest, reg_errcode_t *err);
75static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
76				 re_dfa_t *dfa, re_token_t *token,
77				 reg_syntax_t syntax, reg_errcode_t *err);
78static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
79				      re_token_t *token, reg_syntax_t syntax,
80				      reg_errcode_t *err);
81static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82					    re_string_t *regexp,
83					    re_token_t *token, int token_len,
84					    re_dfa_t *dfa,
85					    reg_syntax_t syntax,
86					    int accept_hyphen);
87static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
88					  re_string_t *regexp,
89					  re_token_t *token);
90#ifdef RE_ENABLE_I18N
91static reg_errcode_t build_equiv_class (bitset_t sbcset,
92					re_charset_t *mbcset,
93					int *equiv_class_alloc,
94					const unsigned char *name);
95static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96				      bitset_t sbcset,
97				      re_charset_t *mbcset,
98				      int *char_class_alloc,
99				      const unsigned char *class_name,
100				      reg_syntax_t syntax);
101#else  /* not RE_ENABLE_I18N */
102static reg_errcode_t build_equiv_class (bitset_t sbcset,
103					const unsigned char *name);
104static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105				      bitset_t sbcset,
106				      const unsigned char *class_name,
107				      reg_syntax_t syntax);
108#endif /* not RE_ENABLE_I18N */
109static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
110				       RE_TRANSLATE_TYPE trans,
111				       const unsigned char *class_name,
112				       const unsigned char *extra,
113				       int non_match, reg_errcode_t *err);
114static bin_tree_t *create_tree (re_dfa_t *dfa,
115				bin_tree_t *left, bin_tree_t *right,
116				re_token_type_t type);
117static bin_tree_t *create_token_tree (re_dfa_t *dfa,
118				      bin_tree_t *left, bin_tree_t *right,
119				      const re_token_t *token);
120static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
121static void free_token (re_token_t *node);
122static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
123static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124
125/* This table gives an error message for each of the error codes listed
126   in regex.h.  Obviously the order here has to be same as there.
127   POSIX doesn't require that we do anything for REG_NOERROR,
128   but why not be nice?  */
129
130const char __re_error_msgid[] attribute_hidden =
131  {
132#define REG_NOERROR_IDX	0
133    gettext_noop ("Success")	/* REG_NOERROR */
134    "\0"
135#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136    gettext_noop ("No match")	/* REG_NOMATCH */
137    "\0"
138#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
139    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140    "\0"
141#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143    "\0"
144#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146    "\0"
147#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
148    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149    "\0"
150#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
151    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152    "\0"
153#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
154    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
155    "\0"
156#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158    "\0"
159#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161    "\0"
162#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
163    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164    "\0"
165#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166    gettext_noop ("Invalid range end")	/* REG_ERANGE */
167    "\0"
168#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
169    gettext_noop ("Memory exhausted") /* REG_ESPACE */
170    "\0"
171#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
172    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173    "\0"
174#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175    gettext_noop ("Premature end of regular expression") /* REG_EEND */
176    "\0"
177#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
178    gettext_noop ("Regular expression too big") /* REG_ESIZE */
179    "\0"
180#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
181    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182  };
183
184const size_t __re_error_msgid_idx[] attribute_hidden =
185  {
186    REG_NOERROR_IDX,
187    REG_NOMATCH_IDX,
188    REG_BADPAT_IDX,
189    REG_ECOLLATE_IDX,
190    REG_ECTYPE_IDX,
191    REG_EESCAPE_IDX,
192    REG_ESUBREG_IDX,
193    REG_EBRACK_IDX,
194    REG_EPAREN_IDX,
195    REG_EBRACE_IDX,
196    REG_BADBR_IDX,
197    REG_ERANGE_IDX,
198    REG_ESPACE_IDX,
199    REG_BADRPT_IDX,
200    REG_EEND_IDX,
201    REG_ESIZE_IDX,
202    REG_ERPAREN_IDX
203  };
204
205/* Entry points for GNU code.  */
206
207/* re_compile_pattern is the GNU regular expression compiler: it
208   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209   Returns 0 if the pattern was valid, otherwise an error string.
210
211   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212   are set in BUFP on entry.  */
213
214const char *
215re_compile_pattern (pattern, length, bufp)
216    const char *pattern;
217    size_t length;
218    struct re_pattern_buffer *bufp;
219{
220  reg_errcode_t ret;
221
222  /* And GNU code determines whether or not to get register information
223     by passing null for the REGS argument to re_match, etc., not by
224     setting no_sub, unless RE_NO_SUB is set.  */
225  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227  /* Match anchors at newline.  */
228  bufp->newline_anchor = 1;
229
230  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232  if (!ret)
233    return NULL;
234  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235}
236#ifdef _LIBC
237weak_alias (__re_compile_pattern, re_compile_pattern)
238#endif
239
240/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
241   also be assigned to arbitrarily: each pattern buffer stores its own
242   syntax, so it can be changed between regex compilations.  */
243/* This has no initializer because initialized variables in Emacs
244   become read-only after dumping.  */
245reg_syntax_t re_syntax_options;
246
247
248/* Specify the precise syntax of regexps for compilation.  This provides
249   for compatibility for various utilities which historically have
250   different, incompatible syntaxes.
251
252   The argument SYNTAX is a bit mask comprised of the various bits
253   defined in regex.h.  We return the old syntax.  */
254
255reg_syntax_t
256re_set_syntax (syntax)
257    reg_syntax_t syntax;
258{
259  reg_syntax_t ret = re_syntax_options;
260
261  re_syntax_options = syntax;
262  return ret;
263}
264#ifdef _LIBC
265weak_alias (__re_set_syntax, re_set_syntax)
266#endif
267
268int
269re_compile_fastmap (bufp)
270    struct re_pattern_buffer *bufp;
271{
272  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273  char *fastmap = bufp->fastmap;
274
275  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277  if (dfa->init_state != dfa->init_state_word)
278    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279  if (dfa->init_state != dfa->init_state_nl)
280    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281  if (dfa->init_state != dfa->init_state_begbuf)
282    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283  bufp->fastmap_accurate = 1;
284  return 0;
285}
286#ifdef _LIBC
287weak_alias (__re_compile_fastmap, re_compile_fastmap)
288#endif
289
290static inline void
291__attribute ((always_inline))
292re_set_fastmap (char *fastmap, int icase, int ch)
293{
294  fastmap[ch] = 1;
295  if (icase)
296    fastmap[tolower (ch)] = 1;
297}
298
299/* Helper function for re_compile_fastmap.
300   Compile fastmap for the initial_state INIT_STATE.  */
301
302static void
303re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
304			 char *fastmap)
305{
306  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
307  int node_cnt;
308  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
309  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
310    {
311      int node = init_state->nodes.elems[node_cnt];
312      re_token_type_t type = dfa->nodes[node].type;
313
314      if (type == CHARACTER)
315	{
316	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317#ifdef RE_ENABLE_I18N
318	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
319	    {
320	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
321	      wchar_t wc;
322	      mbstate_t state;
323
324	      p = buf;
325	      *p++ = dfa->nodes[node].opr.c;
326	      while (++node < dfa->nodes_len
327		     &&	dfa->nodes[node].type == CHARACTER
328		     && dfa->nodes[node].mb_partial)
329		*p++ = dfa->nodes[node].opr.c;
330	      memset (&state, '\0', sizeof (state));
331	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
332			     &state) == p - buf
333		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
334		      != (size_t) -1))
335		re_set_fastmap (fastmap, 0, buf[0]);
336	    }
337#endif
338	}
339      else if (type == SIMPLE_BRACKET)
340	{
341	  int i, ch;
342	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343	    {
344	      int j;
345	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
346	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
347		if (w & ((bitset_word_t) 1 << j))
348		  re_set_fastmap (fastmap, icase, ch);
349	    }
350	}
351#ifdef RE_ENABLE_I18N
352      else if (type == COMPLEX_BRACKET)
353	{
354	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355	  int i;
356
357# ifdef _LIBC
358	  /* See if we have to try all bytes which start multiple collation
359	     elements.
360	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361		  collation element, and don't catch 'b' since 'b' is
362		  the only collation element which starts from 'b' (and
363		  it is caught by SIMPLE_BRACKET).  */
364	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
365		  && (cset->ncoll_syms || cset->nranges))
366		{
367		  const int32_t *table = (const int32_t *)
368		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369		  for (i = 0; i < SBC_MAX; ++i)
370		    if (table[i] < 0)
371		      re_set_fastmap (fastmap, icase, i);
372		}
373# endif /* _LIBC */
374
375	  /* See if we have to start the match at all multibyte characters,
376	     i.e. where we would not find an invalid sequence.  This only
377	     applies to multibyte character sets; for single byte character
378	     sets, the SIMPLE_BRACKET again suffices.  */
379	  if (dfa->mb_cur_max > 1
380	      && (cset->nchar_classes || cset->non_match
381# ifdef _LIBC
382		  || cset->nequiv_classes
383# endif /* _LIBC */
384		 ))
385	    {
386	      unsigned char c = 0;
387	      do
388		{
389		  mbstate_t mbs;
390		  memset (&mbs, 0, sizeof (mbs));
391		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
392		    re_set_fastmap (fastmap, false, (int) c);
393		}
394	      while (++c != 0);
395	    }
396
397	  else
398	    {
399	      /* ... Else catch all bytes which can start the mbchars.  */
400	      for (i = 0; i < cset->nmbchars; ++i)
401		{
402		  char buf[256];
403		  mbstate_t state;
404		  memset (&state, '\0', sizeof (state));
405		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
406		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
407		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
408		    {
409		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
410			  != (size_t) -1)
411			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
412		    }
413 		}
414 	    }
415	}
416#endif /* RE_ENABLE_I18N */
417      else if (type == OP_PERIOD
418#ifdef RE_ENABLE_I18N
419	       || type == OP_UTF8_PERIOD
420#endif /* RE_ENABLE_I18N */
421	       || type == END_OF_RE)
422	{
423	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
424	  if (type == END_OF_RE)
425	    bufp->can_be_null = 1;
426	  return;
427	}
428    }
429}
430
431/* Entry point for POSIX code.  */
432/* regcomp takes a regular expression as a string and compiles it.
433
434   PREG is a regex_t *.  We do not expect any fields to be initialized,
435   since POSIX says we shouldn't.  Thus, we set
436
437     `buffer' to the compiled pattern;
438     `used' to the length of the compiled pattern;
439     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440       REG_EXTENDED bit in CFLAGS is set; otherwise, to
441       RE_SYNTAX_POSIX_BASIC;
442     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
443     `fastmap' to an allocated space for the fastmap;
444     `fastmap_accurate' to zero;
445     `re_nsub' to the number of subexpressions in PATTERN.
446
447   PATTERN is the address of the pattern string.
448
449   CFLAGS is a series of bits which affect compilation.
450
451     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452     use POSIX basic syntax.
453
454     If REG_NEWLINE is set, then . and [^...] don't match newline.
455     Also, regexec will try a match beginning after every newline.
456
457     If REG_ICASE is set, then we considers upper- and lowercase
458     versions of letters to be equivalent when matching.
459
460     If REG_NOSUB is set, then when PREG is passed to regexec, that
461     routine will report only success or failure, and nothing about the
462     registers.
463
464   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
465   the return codes and their meanings.)  */
466
467int
468regcomp (preg, pattern, cflags)
469    regex_t *__restrict preg;
470    const char *__restrict pattern;
471    int cflags;
472{
473  reg_errcode_t ret;
474  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
475			 : RE_SYNTAX_POSIX_BASIC);
476
477  preg->buffer = NULL;
478  preg->allocated = 0;
479  preg->used = 0;
480
481  /* Try to allocate space for the fastmap.  */
482  preg->fastmap = re_malloc (char, SBC_MAX);
483  if (BE (preg->fastmap == NULL, 0))
484    return REG_ESPACE;
485
486  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
487
488  /* If REG_NEWLINE is set, newlines are treated differently.  */
489  if (cflags & REG_NEWLINE)
490    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
491      syntax &= ~RE_DOT_NEWLINE;
492      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
493      /* It also changes the matching behavior.  */
494      preg->newline_anchor = 1;
495    }
496  else
497    preg->newline_anchor = 0;
498  preg->no_sub = !!(cflags & REG_NOSUB);
499  preg->translate = NULL;
500
501  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
502
503  /* POSIX doesn't distinguish between an unmatched open-group and an
504     unmatched close-group: both are REG_EPAREN.  */
505  if (ret == REG_ERPAREN)
506    ret = REG_EPAREN;
507
508  /* We have already checked preg->fastmap != NULL.  */
509  if (BE (ret == REG_NOERROR, 1))
510    /* Compute the fastmap now, since regexec cannot modify the pattern
511       buffer.  This function never fails in this implementation.  */
512    (void) re_compile_fastmap (preg);
513  else
514    {
515      /* Some error occurred while compiling the expression.  */
516      re_free (preg->fastmap);
517      preg->fastmap = NULL;
518    }
519
520  return (int) ret;
521}
522#ifdef _LIBC
523weak_alias (__regcomp, regcomp)
524#endif
525
526/* Returns a message corresponding to an error code, ERRCODE, returned
527   from either regcomp or regexec.   We don't use PREG here.  */
528
529size_t
530regerror (errcode, preg, errbuf, errbuf_size)
531    int errcode;
532    const regex_t *__restrict preg;
533    char *__restrict errbuf;
534    size_t errbuf_size;
535{
536  const char *msg;
537  size_t msg_size;
538
539  if (BE (errcode < 0
540	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
541			       / sizeof (__re_error_msgid_idx[0])), 0))
542    /* Only error codes returned by the rest of the code should be passed
543       to this routine.  If we are given anything else, or if other regex
544       code generates an invalid error code, then the program has a bug.
545       Dump core so we can fix it.  */
546    abort ();
547
548  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
549
550  msg_size = strlen (msg) + 1; /* Includes the null.  */
551
552  if (BE (errbuf_size != 0, 1))
553    {
554      if (BE (msg_size > errbuf_size, 0))
555	{
556#if defined HAVE_MEMPCPY || defined _LIBC
557	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
558#else
559	  memcpy (errbuf, msg, errbuf_size - 1);
560	  errbuf[errbuf_size - 1] = 0;
561#endif
562	}
563      else
564	memcpy (errbuf, msg, msg_size);
565    }
566
567  return msg_size;
568}
569#ifdef _LIBC
570weak_alias (__regerror, regerror)
571#endif
572
573
574#ifdef RE_ENABLE_I18N
575/* This static array is used for the map to single-byte characters when
576   UTF-8 is used.  Otherwise we would allocate memory just to initialize
577   it the same all the time.  UTF-8 is the preferred encoding so this is
578   a worthwhile optimization.  */
579static const bitset_t utf8_sb_map =
580{
581  /* Set the first 128 bits.  */
582  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
583};
584#endif
585
586
587static void
588free_dfa_content (re_dfa_t *dfa)
589{
590  int i, j;
591
592  if (dfa->nodes)
593    for (i = 0; i < dfa->nodes_len; ++i)
594      free_token (dfa->nodes + i);
595  re_free (dfa->nexts);
596  for (i = 0; i < dfa->nodes_len; ++i)
597    {
598      if (dfa->eclosures != NULL)
599	re_node_set_free (dfa->eclosures + i);
600      if (dfa->inveclosures != NULL)
601	re_node_set_free (dfa->inveclosures + i);
602      if (dfa->edests != NULL)
603	re_node_set_free (dfa->edests + i);
604    }
605  re_free (dfa->edests);
606  re_free (dfa->eclosures);
607  re_free (dfa->inveclosures);
608  re_free (dfa->nodes);
609
610  if (dfa->state_table)
611    for (i = 0; i <= dfa->state_hash_mask; ++i)
612      {
613	struct re_state_table_entry *entry = dfa->state_table + i;
614	for (j = 0; j < entry->num; ++j)
615	  {
616	    re_dfastate_t *state = entry->array[j];
617	    free_state (state);
618	  }
619        re_free (entry->array);
620      }
621  re_free (dfa->state_table);
622#ifdef RE_ENABLE_I18N
623  if (dfa->sb_char != utf8_sb_map)
624    re_free (dfa->sb_char);
625#endif
626  re_free (dfa->subexp_map);
627#ifdef DEBUG
628  re_free (dfa->re_str);
629#endif
630
631  re_free (dfa);
632}
633
634
635/* Free dynamically allocated space used by PREG.  */
636
637void
638regfree (preg)
639    regex_t *preg;
640{
641  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
642  if (BE (dfa != NULL, 1))
643    free_dfa_content (dfa);
644  preg->buffer = NULL;
645  preg->allocated = 0;
646
647  re_free (preg->fastmap);
648  preg->fastmap = NULL;
649
650  re_free (preg->translate);
651  preg->translate = NULL;
652}
653#ifdef _LIBC
654weak_alias (__regfree, regfree)
655#endif
656
657/* Entry points compatible with 4.2 BSD regex library.  We don't define
658   them unless specifically requested.  */
659
660#if defined _REGEX_RE_COMP || defined _LIBC
661
662/* BSD has one and only one pattern buffer.  */
663static struct re_pattern_buffer re_comp_buf;
664
665char *
666# ifdef _LIBC
667/* Make these definitions weak in libc, so POSIX programs can redefine
668   these names if they don't use our functions, and still use
669   regcomp/regexec above without link errors.  */
670weak_function
671# endif
672re_comp (s)
673     const char *s;
674{
675  reg_errcode_t ret;
676  char *fastmap;
677
678  if (!s)
679    {
680      if (!re_comp_buf.buffer)
681	return gettext ("No previous regular expression");
682      return 0;
683    }
684
685  if (re_comp_buf.buffer)
686    {
687      fastmap = re_comp_buf.fastmap;
688      re_comp_buf.fastmap = NULL;
689      __regfree (&re_comp_buf);
690      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
691      re_comp_buf.fastmap = fastmap;
692    }
693
694  if (re_comp_buf.fastmap == NULL)
695    {
696      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
697      if (re_comp_buf.fastmap == NULL)
698	return (char *) gettext (__re_error_msgid
699				 + __re_error_msgid_idx[(int) REG_ESPACE]);
700    }
701
702  /* Since `re_exec' always passes NULL for the `regs' argument, we
703     don't need to initialize the pattern buffer fields which affect it.  */
704
705  /* Match anchors at newlines.  */
706  re_comp_buf.newline_anchor = 1;
707
708  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
709
710  if (!ret)
711    return NULL;
712
713  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
714  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
715}
716
717#ifdef _LIBC
718libc_freeres_fn (free_mem)
719{
720  __regfree (&re_comp_buf);
721}
722#endif
723
724#endif /* _REGEX_RE_COMP */
725
726/* Internal entry point.
727   Compile the regular expression PATTERN, whose length is LENGTH.
728   SYNTAX indicate regular expression's syntax.  */
729
730static reg_errcode_t
731re_compile_internal (regex_t *preg, const char * pattern, size_t length,
732		     reg_syntax_t syntax)
733{
734  reg_errcode_t err = REG_NOERROR;
735  re_dfa_t *dfa;
736  re_string_t regexp;
737
738  /* Initialize the pattern buffer.  */
739  preg->fastmap_accurate = 0;
740  preg->syntax = syntax;
741  preg->not_bol = preg->not_eol = 0;
742  preg->used = 0;
743  preg->re_nsub = 0;
744  preg->can_be_null = 0;
745  preg->regs_allocated = REGS_UNALLOCATED;
746
747  /* Initialize the dfa.  */
748  dfa = (re_dfa_t *) preg->buffer;
749  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
750    {
751      /* If zero allocated, but buffer is non-null, try to realloc
752	 enough space.  This loses if buffer's address is bogus, but
753	 that is the user's responsibility.  If ->buffer is NULL this
754	 is a simple allocation.  */
755      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
756      if (dfa == NULL)
757	return REG_ESPACE;
758      preg->allocated = sizeof (re_dfa_t);
759      preg->buffer = (unsigned char *) dfa;
760    }
761  preg->used = sizeof (re_dfa_t);
762
763  err = init_dfa (dfa, length);
764  if (BE (err != REG_NOERROR, 0))
765    {
766      free_dfa_content (dfa);
767      preg->buffer = NULL;
768      preg->allocated = 0;
769      return err;
770    }
771#ifdef DEBUG
772  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
773  dfa->re_str = re_malloc (char, length + 1);
774  strncpy (dfa->re_str, pattern, length + 1);
775#endif
776
777  __libc_lock_init (dfa->lock);
778
779  err = re_string_construct (&regexp, pattern, length, preg->translate,
780			     syntax & RE_ICASE, dfa);
781  if (BE (err != REG_NOERROR, 0))
782    {
783    re_compile_internal_free_return:
784      free_workarea_compile (preg);
785      re_string_destruct (&regexp);
786      free_dfa_content (dfa);
787      preg->buffer = NULL;
788      preg->allocated = 0;
789      return err;
790    }
791
792  /* Parse the regular expression, and build a structure tree.  */
793  preg->re_nsub = 0;
794  dfa->str_tree = parse (&regexp, preg, syntax, &err);
795  if (BE (dfa->str_tree == NULL, 0))
796    goto re_compile_internal_free_return;
797
798  /* Analyze the tree and create the nfa.  */
799  err = analyze (preg);
800  if (BE (err != REG_NOERROR, 0))
801    goto re_compile_internal_free_return;
802
803#ifdef RE_ENABLE_I18N
804  /* If possible, do searching in single byte encoding to speed things up.  */
805  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
806    optimize_utf8 (dfa);
807#endif
808
809  /* Then create the initial state of the dfa.  */
810  err = create_initial_state (dfa);
811
812  /* Release work areas.  */
813  free_workarea_compile (preg);
814  re_string_destruct (&regexp);
815
816  if (BE (err != REG_NOERROR, 0))
817    {
818      free_dfa_content (dfa);
819      preg->buffer = NULL;
820      preg->allocated = 0;
821    }
822
823  return err;
824}
825
826/* Initialize DFA.  We use the length of the regular expression PAT_LEN
827   as the initial length of some arrays.  */
828
829static reg_errcode_t
830init_dfa (re_dfa_t *dfa, size_t pat_len)
831{
832  unsigned int table_size;
833#ifndef _LIBC
834  char *codeset_name;
835#endif
836
837  memset (dfa, '\0', sizeof (re_dfa_t));
838
839  /* Force allocation of str_tree_storage the first time.  */
840  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
841
842  /* Avoid overflows.  */
843  if (pat_len == SIZE_MAX)
844    return REG_ESPACE;
845
846  dfa->nodes_alloc = pat_len + 1;
847  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
848
849  /*  table_size = 2 ^ ceil(log pat_len) */
850  for (table_size = 1; ; table_size <<= 1)
851    if (table_size > pat_len)
852      break;
853
854  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
855  dfa->state_hash_mask = table_size - 1;
856
857  dfa->mb_cur_max = MB_CUR_MAX;
858#ifdef _LIBC
859  if (dfa->mb_cur_max == 6
860      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
861    dfa->is_utf8 = 1;
862  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
863		       != 0);
864#else
865# ifdef HAVE_LANGINFO_CODESET
866  codeset_name = nl_langinfo (CODESET);
867# else
868  codeset_name = getenv ("LC_ALL");
869  if (codeset_name == NULL || codeset_name[0] == '\0')
870    codeset_name = getenv ("LC_CTYPE");
871  if (codeset_name == NULL || codeset_name[0] == '\0')
872    codeset_name = getenv ("LANG");
873  if (codeset_name == NULL)
874    codeset_name = "";
875  else if (strchr (codeset_name, '.') !=  NULL)
876    codeset_name = strchr (codeset_name, '.') + 1;
877# endif
878
879  if (strcasecmp (codeset_name, "UTF-8") == 0
880      || strcasecmp (codeset_name, "UTF8") == 0)
881    dfa->is_utf8 = 1;
882
883  /* We check exhaustively in the loop below if this charset is a
884     superset of ASCII.  */
885  dfa->map_notascii = 0;
886#endif
887
888#ifdef RE_ENABLE_I18N
889  if (dfa->mb_cur_max > 1)
890    {
891      if (dfa->is_utf8)
892	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
893      else
894	{
895	  int i, j, ch;
896
897	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
898	  if (BE (dfa->sb_char == NULL, 0))
899	    return REG_ESPACE;
900
901	  /* Set the bits corresponding to single byte chars.  */
902	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
903	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
904	      {
905		wint_t wch = __btowc (ch);
906		if (wch != WEOF)
907		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
908# ifndef _LIBC
909		if (isascii (ch) && wch != ch)
910		  dfa->map_notascii = 1;
911# endif
912	      }
913	}
914    }
915#endif
916
917  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
918    return REG_ESPACE;
919  return REG_NOERROR;
920}
921
922/* Initialize WORD_CHAR table, which indicate which character is
923   "word".  In this case "word" means that it is the word construction
924   character used by some operators like "\<", "\>", etc.  */
925
926static void
927internal_function
928init_word_char (re_dfa_t *dfa)
929{
930  int i, j, ch;
931  dfa->word_ops_used = 1;
932  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
933    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
934      if (isalnum (ch) || ch == '_')
935	dfa->word_char[i] |= (bitset_word_t) 1 << j;
936}
937
938/* Free the work area which are only used while compiling.  */
939
940static void
941free_workarea_compile (regex_t *preg)
942{
943  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
944  bin_tree_storage_t *storage, *next;
945  for (storage = dfa->str_tree_storage; storage; storage = next)
946    {
947      next = storage->next;
948      re_free (storage);
949    }
950  dfa->str_tree_storage = NULL;
951  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
952  dfa->str_tree = NULL;
953  re_free (dfa->org_indices);
954  dfa->org_indices = NULL;
955}
956
957/* Create initial states for all contexts.  */
958
959static reg_errcode_t
960create_initial_state (re_dfa_t *dfa)
961{
962  int first, i;
963  reg_errcode_t err;
964  re_node_set init_nodes;
965
966  /* Initial states have the epsilon closure of the node which is
967     the first node of the regular expression.  */
968  first = dfa->str_tree->first->node_idx;
969  dfa->init_node = first;
970  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
971  if (BE (err != REG_NOERROR, 0))
972    return err;
973
974  /* The back-references which are in initial states can epsilon transit,
975     since in this case all of the subexpressions can be null.
976     Then we add epsilon closures of the nodes which are the next nodes of
977     the back-references.  */
978  if (dfa->nbackref > 0)
979    for (i = 0; i < init_nodes.nelem; ++i)
980      {
981	int node_idx = init_nodes.elems[i];
982	re_token_type_t type = dfa->nodes[node_idx].type;
983
984	int clexp_idx;
985	if (type != OP_BACK_REF)
986	  continue;
987	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
988	  {
989	    re_token_t *clexp_node;
990	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
991	    if (clexp_node->type == OP_CLOSE_SUBEXP
992		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
993	      break;
994	  }
995	if (clexp_idx == init_nodes.nelem)
996	  continue;
997
998	if (type == OP_BACK_REF)
999	  {
1000	    int dest_idx = dfa->edests[node_idx].elems[0];
1001	    if (!re_node_set_contains (&init_nodes, dest_idx))
1002	      {
1003		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1004		i = 0;
1005	      }
1006	  }
1007      }
1008
1009  /* It must be the first time to invoke acquire_state.  */
1010  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1011  /* We don't check ERR here, since the initial state must not be NULL.  */
1012  if (BE (dfa->init_state == NULL, 0))
1013    return err;
1014  if (dfa->init_state->has_constraint)
1015    {
1016      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1017						       CONTEXT_WORD);
1018      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1019						     CONTEXT_NEWLINE);
1020      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1021							 &init_nodes,
1022							 CONTEXT_NEWLINE
1023							 | CONTEXT_BEGBUF);
1024      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1025	      || dfa->init_state_begbuf == NULL, 0))
1026	return err;
1027    }
1028  else
1029    dfa->init_state_word = dfa->init_state_nl
1030      = dfa->init_state_begbuf = dfa->init_state;
1031
1032  re_node_set_free (&init_nodes);
1033  return REG_NOERROR;
1034}
1035
1036#ifdef RE_ENABLE_I18N
1037/* If it is possible to do searching in single byte encoding instead of UTF-8
1038   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1039   DFA nodes where needed.  */
1040
1041static void
1042optimize_utf8 (re_dfa_t *dfa)
1043{
1044  int node, i, mb_chars = 0, has_period = 0;
1045
1046  for (node = 0; node < dfa->nodes_len; ++node)
1047    switch (dfa->nodes[node].type)
1048      {
1049      case CHARACTER:
1050	if (dfa->nodes[node].opr.c >= 0x80)
1051	  mb_chars = 1;
1052	break;
1053      case ANCHOR:
1054	switch (dfa->nodes[node].opr.ctx_type)
1055	  {
1056	  case LINE_FIRST:
1057	  case LINE_LAST:
1058	  case BUF_FIRST:
1059	  case BUF_LAST:
1060	    break;
1061	  default:
1062	    /* Word anchors etc. cannot be handled.  It's okay to test
1063	       opr.ctx_type since constraints (for all DFA nodes) are
1064	       created by ORing one or more opr.ctx_type values.  */
1065	    return;
1066	  }
1067	break;
1068      case OP_PERIOD:
1069        has_period = 1;
1070        break;
1071      case OP_BACK_REF:
1072      case OP_ALT:
1073      case END_OF_RE:
1074      case OP_DUP_ASTERISK:
1075      case OP_OPEN_SUBEXP:
1076      case OP_CLOSE_SUBEXP:
1077	break;
1078      case COMPLEX_BRACKET:
1079	return;
1080      case SIMPLE_BRACKET:
1081	/* Just double check.  The non-ASCII range starts at 0x80.  */
1082	assert (0x80 % BITSET_WORD_BITS == 0);
1083        for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1084	  if (dfa->nodes[node].opr.sbcset[i])
1085	    return;
1086	break;
1087      default:
1088	abort ();
1089      }
1090
1091  if (mb_chars || has_period)
1092    for (node = 0; node < dfa->nodes_len; ++node)
1093      {
1094	if (dfa->nodes[node].type == CHARACTER
1095	    && dfa->nodes[node].opr.c >= 0x80)
1096	  dfa->nodes[node].mb_partial = 0;
1097	else if (dfa->nodes[node].type == OP_PERIOD)
1098	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1099      }
1100
1101  /* The search can be in single byte locale.  */
1102  dfa->mb_cur_max = 1;
1103  dfa->is_utf8 = 0;
1104  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1105}
1106#endif
1107
1108/* Analyze the structure tree, and calculate "first", "next", "edest",
1109   "eclosure", and "inveclosure".  */
1110
1111static reg_errcode_t
1112analyze (regex_t *preg)
1113{
1114  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1115  reg_errcode_t ret;
1116
1117  /* Allocate arrays.  */
1118  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1119  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1120  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1121  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1122  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1123	  || dfa->eclosures == NULL, 0))
1124    return REG_ESPACE;
1125
1126  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1127  if (dfa->subexp_map != NULL)
1128    {
1129      int i;
1130      for (i = 0; i < preg->re_nsub; i++)
1131	dfa->subexp_map[i] = i;
1132      preorder (dfa->str_tree, optimize_subexps, dfa);
1133      for (i = 0; i < preg->re_nsub; i++)
1134	if (dfa->subexp_map[i] != i)
1135	  break;
1136      if (i == preg->re_nsub)
1137	{
1138	  free (dfa->subexp_map);
1139	  dfa->subexp_map = NULL;
1140	}
1141    }
1142
1143  ret = postorder (dfa->str_tree, lower_subexps, preg);
1144  if (BE (ret != REG_NOERROR, 0))
1145    return ret;
1146  ret = postorder (dfa->str_tree, calc_first, dfa);
1147  if (BE (ret != REG_NOERROR, 0))
1148    return ret;
1149  preorder (dfa->str_tree, calc_next, dfa);
1150  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1151  if (BE (ret != REG_NOERROR, 0))
1152    return ret;
1153  ret = calc_eclosure (dfa);
1154  if (BE (ret != REG_NOERROR, 0))
1155    return ret;
1156
1157  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1158     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1159  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1160      || dfa->nbackref)
1161    {
1162      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1163      if (BE (dfa->inveclosures == NULL, 0))
1164        return REG_ESPACE;
1165      ret = calc_inveclosure (dfa);
1166    }
1167
1168  return ret;
1169}
1170
1171/* Our parse trees are very unbalanced, so we cannot use a stack to
1172   implement parse tree visits.  Instead, we use parent pointers and
1173   some hairy code in these two functions.  */
1174static reg_errcode_t
1175postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1176	   void *extra)
1177{
1178  bin_tree_t *node, *prev;
1179
1180  for (node = root; ; )
1181    {
1182      /* Descend down the tree, preferably to the left (or to the right
1183	 if that's the only child).  */
1184      while (node->left || node->right)
1185	if (node->left)
1186          node = node->left;
1187        else
1188          node = node->right;
1189
1190      do
1191	{
1192	  reg_errcode_t err = fn (extra, node);
1193	  if (BE (err != REG_NOERROR, 0))
1194	    return err;
1195          if (node->parent == NULL)
1196	    return REG_NOERROR;
1197	  prev = node;
1198	  node = node->parent;
1199	}
1200      /* Go up while we have a node that is reached from the right.  */
1201      while (node->right == prev || node->right == NULL);
1202      node = node->right;
1203    }
1204}
1205
1206static reg_errcode_t
1207preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1208	  void *extra)
1209{
1210  bin_tree_t *node;
1211
1212  for (node = root; ; )
1213    {
1214      reg_errcode_t err = fn (extra, node);
1215      if (BE (err != REG_NOERROR, 0))
1216	return err;
1217
1218      /* Go to the left node, or up and to the right.  */
1219      if (node->left)
1220	node = node->left;
1221      else
1222	{
1223	  bin_tree_t *prev = NULL;
1224	  while (node->right == prev || node->right == NULL)
1225	    {
1226	      prev = node;
1227	      node = node->parent;
1228	      if (!node)
1229	        return REG_NOERROR;
1230	    }
1231	  node = node->right;
1232	}
1233    }
1234}
1235
1236/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1237   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1238   backreferences as well.  Requires a preorder visit.  */
1239static reg_errcode_t
1240optimize_subexps (void *extra, bin_tree_t *node)
1241{
1242  re_dfa_t *dfa = (re_dfa_t *) extra;
1243
1244  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1245    {
1246      int idx = node->token.opr.idx;
1247      node->token.opr.idx = dfa->subexp_map[idx];
1248      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1249    }
1250
1251  else if (node->token.type == SUBEXP
1252           && node->left && node->left->token.type == SUBEXP)
1253    {
1254      int other_idx = node->left->token.opr.idx;
1255
1256      node->left = node->left->left;
1257      if (node->left)
1258        node->left->parent = node;
1259
1260      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1261      if (other_idx < BITSET_WORD_BITS)
1262	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1263    }
1264
1265  return REG_NOERROR;
1266}
1267
1268/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1269   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1270static reg_errcode_t
1271lower_subexps (void *extra, bin_tree_t *node)
1272{
1273  regex_t *preg = (regex_t *) extra;
1274  reg_errcode_t err = REG_NOERROR;
1275
1276  if (node->left && node->left->token.type == SUBEXP)
1277    {
1278      node->left = lower_subexp (&err, preg, node->left);
1279      if (node->left)
1280	node->left->parent = node;
1281    }
1282  if (node->right && node->right->token.type == SUBEXP)
1283    {
1284      node->right = lower_subexp (&err, preg, node->right);
1285      if (node->right)
1286	node->right->parent = node;
1287    }
1288
1289  return err;
1290}
1291
1292static bin_tree_t *
1293lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1294{
1295  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1296  bin_tree_t *body = node->left;
1297  bin_tree_t *op, *cls, *tree1, *tree;
1298
1299  if (preg->no_sub
1300      /* We do not optimize empty subexpressions, because otherwise we may
1301	 have bad CONCAT nodes with NULL children.  This is obviously not
1302	 very common, so we do not lose much.  An example that triggers
1303	 this case is the sed "script" /\(\)/x.  */
1304      && node->left != NULL
1305      && (node->token.opr.idx >= BITSET_WORD_BITS
1306	  || !(dfa->used_bkref_map
1307	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1308    return node->left;
1309
1310  /* Convert the SUBEXP node to the concatenation of an
1311     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1312  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1313  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1314  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1315  tree = create_tree (dfa, op, tree1, CONCAT);
1316  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1317    {
1318      *err = REG_ESPACE;
1319      return NULL;
1320    }
1321
1322  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1323  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1324  return tree;
1325}
1326
1327/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1328   nodes.  Requires a postorder visit.  */
1329static reg_errcode_t
1330calc_first (void *extra, bin_tree_t *node)
1331{
1332  re_dfa_t *dfa = (re_dfa_t *) extra;
1333  if (node->token.type == CONCAT)
1334    {
1335      node->first = node->left->first;
1336      node->node_idx = node->left->node_idx;
1337    }
1338  else
1339    {
1340      node->first = node;
1341      node->node_idx = re_dfa_add_node (dfa, node->token);
1342      if (BE (node->node_idx == -1, 0))
1343        return REG_ESPACE;
1344      if (node->token.type == ANCHOR)
1345        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1346    }
1347  return REG_NOERROR;
1348}
1349
1350/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1351static reg_errcode_t
1352calc_next (void *extra, bin_tree_t *node)
1353{
1354  switch (node->token.type)
1355    {
1356    case OP_DUP_ASTERISK:
1357      node->left->next = node;
1358      break;
1359    case CONCAT:
1360      node->left->next = node->right->first;
1361      node->right->next = node->next;
1362      break;
1363    default:
1364      if (node->left)
1365	node->left->next = node->next;
1366      if (node->right)
1367        node->right->next = node->next;
1368      break;
1369    }
1370  return REG_NOERROR;
1371}
1372
1373/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1374static reg_errcode_t
1375link_nfa_nodes (void *extra, bin_tree_t *node)
1376{
1377  re_dfa_t *dfa = (re_dfa_t *) extra;
1378  int idx = node->node_idx;
1379  reg_errcode_t err = REG_NOERROR;
1380
1381  switch (node->token.type)
1382    {
1383    case CONCAT:
1384      break;
1385
1386    case END_OF_RE:
1387      assert (node->next == NULL);
1388      break;
1389
1390    case OP_DUP_ASTERISK:
1391    case OP_ALT:
1392      {
1393	int left, right;
1394	dfa->has_plural_match = 1;
1395	if (node->left != NULL)
1396	  left = node->left->first->node_idx;
1397	else
1398	  left = node->next->node_idx;
1399	if (node->right != NULL)
1400	  right = node->right->first->node_idx;
1401	else
1402	  right = node->next->node_idx;
1403	assert (left > -1);
1404	assert (right > -1);
1405	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1406      }
1407      break;
1408
1409    case ANCHOR:
1410    case OP_OPEN_SUBEXP:
1411    case OP_CLOSE_SUBEXP:
1412      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1413      break;
1414
1415    case OP_BACK_REF:
1416      dfa->nexts[idx] = node->next->node_idx;
1417      if (node->token.type == OP_BACK_REF)
1418	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1419      break;
1420
1421    default:
1422      assert (!IS_EPSILON_NODE (node->token.type));
1423      dfa->nexts[idx] = node->next->node_idx;
1424      break;
1425    }
1426
1427  return err;
1428}
1429
1430/* Duplicate the epsilon closure of the node ROOT_NODE.
1431   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1432   to their own constraint.  */
1433
1434static reg_errcode_t
1435internal_function
1436duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1437			int root_node, unsigned int init_constraint)
1438{
1439  int org_node, clone_node, ret;
1440  unsigned int constraint = init_constraint;
1441  for (org_node = top_org_node, clone_node = top_clone_node;;)
1442    {
1443      int org_dest, clone_dest;
1444      if (dfa->nodes[org_node].type == OP_BACK_REF)
1445	{
1446	  /* If the back reference epsilon-transit, its destination must
1447	     also have the constraint.  Then duplicate the epsilon closure
1448	     of the destination of the back reference, and store it in
1449	     edests of the back reference.  */
1450	  org_dest = dfa->nexts[org_node];
1451	  re_node_set_empty (dfa->edests + clone_node);
1452	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1453	  if (BE (clone_dest == -1, 0))
1454	    return REG_ESPACE;
1455	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1456	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1457	  if (BE (ret < 0, 0))
1458	    return REG_ESPACE;
1459	}
1460      else if (dfa->edests[org_node].nelem == 0)
1461	{
1462	  /* In case of the node can't epsilon-transit, don't duplicate the
1463	     destination and store the original destination as the
1464	     destination of the node.  */
1465	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1466	  break;
1467	}
1468      else if (dfa->edests[org_node].nelem == 1)
1469	{
1470	  /* In case of the node can epsilon-transit, and it has only one
1471	     destination.  */
1472	  org_dest = dfa->edests[org_node].elems[0];
1473	  re_node_set_empty (dfa->edests + clone_node);
1474	  /* If the node is root_node itself, it means the epsilon clsoure
1475	     has a loop.   Then tie it to the destination of the root_node.  */
1476	  if (org_node == root_node && clone_node != org_node)
1477	    {
1478	      ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1479	      if (BE (ret < 0, 0))
1480		return REG_ESPACE;
1481	      break;
1482	    }
1483	  /* In case of the node has another constraint, add it.  */
1484	  constraint |= dfa->nodes[org_node].constraint;
1485	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1486	  if (BE (clone_dest == -1, 0))
1487	    return REG_ESPACE;
1488	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489	  if (BE (ret < 0, 0))
1490	    return REG_ESPACE;
1491	}
1492      else /* dfa->edests[org_node].nelem == 2 */
1493	{
1494	  /* In case of the node can epsilon-transit, and it has two
1495	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1496	  org_dest = dfa->edests[org_node].elems[0];
1497	  re_node_set_empty (dfa->edests + clone_node);
1498	  /* Search for a duplicated node which satisfies the constraint.  */
1499	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1500	  if (clone_dest == -1)
1501	    {
1502	      /* There is no such duplicated node, create a new one.  */
1503	      reg_errcode_t err;
1504	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1505	      if (BE (clone_dest == -1, 0))
1506		return REG_ESPACE;
1507	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1508	      if (BE (ret < 0, 0))
1509		return REG_ESPACE;
1510	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1511					    root_node, constraint);
1512	      if (BE (err != REG_NOERROR, 0))
1513		return err;
1514	    }
1515	  else
1516	    {
1517	      /* There is a duplicated node which satisfies the constraint,
1518		 use it to avoid infinite loop.  */
1519	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1520	      if (BE (ret < 0, 0))
1521		return REG_ESPACE;
1522	    }
1523
1524	  org_dest = dfa->edests[org_node].elems[1];
1525	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1526	  if (BE (clone_dest == -1, 0))
1527	    return REG_ESPACE;
1528	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529	  if (BE (ret < 0, 0))
1530	    return REG_ESPACE;
1531	}
1532      org_node = org_dest;
1533      clone_node = clone_dest;
1534    }
1535  return REG_NOERROR;
1536}
1537
1538/* Search for a node which is duplicated from the node ORG_NODE, and
1539   satisfies the constraint CONSTRAINT.  */
1540
1541static int
1542search_duplicated_node (const re_dfa_t *dfa, int org_node,
1543			unsigned int constraint)
1544{
1545  int idx;
1546  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1547    {
1548      if (org_node == dfa->org_indices[idx]
1549	  && constraint == dfa->nodes[idx].constraint)
1550	return idx; /* Found.  */
1551    }
1552  return -1; /* Not found.  */
1553}
1554
1555/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1556   Return the index of the new node, or -1 if insufficient storage is
1557   available.  */
1558
1559static int
1560duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1561{
1562  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1563  if (BE (dup_idx != -1, 1))
1564    {
1565      dfa->nodes[dup_idx].constraint = constraint;
1566      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1567      dfa->nodes[dup_idx].duplicated = 1;
1568
1569      /* Store the index of the original node.  */
1570      dfa->org_indices[dup_idx] = org_idx;
1571    }
1572  return dup_idx;
1573}
1574
1575static reg_errcode_t
1576calc_inveclosure (re_dfa_t *dfa)
1577{
1578  int src, idx, ret;
1579  for (idx = 0; idx < dfa->nodes_len; ++idx)
1580    re_node_set_init_empty (dfa->inveclosures + idx);
1581
1582  for (src = 0; src < dfa->nodes_len; ++src)
1583    {
1584      int *elems = dfa->eclosures[src].elems;
1585      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1586	{
1587	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1588	  if (BE (ret == -1, 0))
1589	    return REG_ESPACE;
1590	}
1591    }
1592
1593  return REG_NOERROR;
1594}
1595
1596/* Calculate "eclosure" for all the node in DFA.  */
1597
1598static reg_errcode_t
1599calc_eclosure (re_dfa_t *dfa)
1600{
1601  int node_idx, incomplete;
1602#ifdef DEBUG
1603  assert (dfa->nodes_len > 0);
1604#endif
1605  incomplete = 0;
1606  /* For each nodes, calculate epsilon closure.  */
1607  for (node_idx = 0; ; ++node_idx)
1608    {
1609      reg_errcode_t err;
1610      re_node_set eclosure_elem;
1611      if (node_idx == dfa->nodes_len)
1612	{
1613	  if (!incomplete)
1614	    break;
1615	  incomplete = 0;
1616	  node_idx = 0;
1617	}
1618
1619#ifdef DEBUG
1620      assert (dfa->eclosures[node_idx].nelem != -1);
1621#endif
1622
1623      /* If we have already calculated, skip it.  */
1624      if (dfa->eclosures[node_idx].nelem != 0)
1625	continue;
1626      /* Calculate epsilon closure of `node_idx'.  */
1627      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1628      if (BE (err != REG_NOERROR, 0))
1629	return err;
1630
1631      if (dfa->eclosures[node_idx].nelem == 0)
1632	{
1633	  incomplete = 1;
1634	  re_node_set_free (&eclosure_elem);
1635	}
1636    }
1637  return REG_NOERROR;
1638}
1639
1640/* Calculate epsilon closure of NODE.  */
1641
1642static reg_errcode_t
1643calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1644{
1645  reg_errcode_t err;
1646  int i, incomplete;
1647  re_node_set eclosure;
1648  incomplete = 0;
1649  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1650  if (BE (err != REG_NOERROR, 0))
1651    return err;
1652
1653  /* This indicates that we are calculating this node now.
1654     We reference this value to avoid infinite loop.  */
1655  dfa->eclosures[node].nelem = -1;
1656
1657  /* If the current node has constraints, duplicate all nodes
1658     since they must inherit the constraints.  */
1659  if (dfa->nodes[node].constraint
1660      && dfa->edests[node].nelem
1661      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1662    {
1663      err = duplicate_node_closure (dfa, node, node, node,
1664				    dfa->nodes[node].constraint);
1665      if (BE (err != REG_NOERROR, 0))
1666	return err;
1667    }
1668
1669  /* Expand each epsilon destination nodes.  */
1670  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1671    for (i = 0; i < dfa->edests[node].nelem; ++i)
1672      {
1673	re_node_set eclosure_elem;
1674	int edest = dfa->edests[node].elems[i];
1675	/* If calculating the epsilon closure of `edest' is in progress,
1676	   return intermediate result.  */
1677	if (dfa->eclosures[edest].nelem == -1)
1678	  {
1679	    incomplete = 1;
1680	    continue;
1681	  }
1682	/* If we haven't calculated the epsilon closure of `edest' yet,
1683	   calculate now. Otherwise use calculated epsilon closure.  */
1684	if (dfa->eclosures[edest].nelem == 0)
1685	  {
1686	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1687	    if (BE (err != REG_NOERROR, 0))
1688	      return err;
1689	  }
1690	else
1691	  eclosure_elem = dfa->eclosures[edest];
1692	/* Merge the epsilon closure of `edest'.  */
1693	re_node_set_merge (&eclosure, &eclosure_elem);
1694	/* If the epsilon closure of `edest' is incomplete,
1695	   the epsilon closure of this node is also incomplete.  */
1696	if (dfa->eclosures[edest].nelem == 0)
1697	  {
1698	    incomplete = 1;
1699	    re_node_set_free (&eclosure_elem);
1700	  }
1701      }
1702
1703  /* Epsilon closures include itself.  */
1704  re_node_set_insert (&eclosure, node);
1705  if (incomplete && !root)
1706    dfa->eclosures[node].nelem = 0;
1707  else
1708    dfa->eclosures[node] = eclosure;
1709  *new_set = eclosure;
1710  return REG_NOERROR;
1711}
1712
1713/* Functions for token which are used in the parser.  */
1714
1715/* Fetch a token from INPUT.
1716   We must not use this function inside bracket expressions.  */
1717
1718static void
1719internal_function
1720fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1721{
1722  re_string_skip_bytes (input, peek_token (result, input, syntax));
1723}
1724
1725/* Peek a token from INPUT, and return the length of the token.
1726   We must not use this function inside bracket expressions.  */
1727
1728static int
1729internal_function
1730peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1731{
1732  unsigned char c;
1733
1734  if (re_string_eoi (input))
1735    {
1736      token->type = END_OF_RE;
1737      return 0;
1738    }
1739
1740  c = re_string_peek_byte (input, 0);
1741  token->opr.c = c;
1742
1743  token->word_char = 0;
1744#ifdef RE_ENABLE_I18N
1745  token->mb_partial = 0;
1746  if (input->mb_cur_max > 1 &&
1747      !re_string_first_byte (input, re_string_cur_idx (input)))
1748    {
1749      token->type = CHARACTER;
1750      token->mb_partial = 1;
1751      return 1;
1752    }
1753#endif
1754  if (c == '\\')
1755    {
1756      unsigned char c2;
1757      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1758	{
1759	  token->type = BACK_SLASH;
1760	  return 1;
1761	}
1762
1763      c2 = re_string_peek_byte_case (input, 1);
1764      token->opr.c = c2;
1765      token->type = CHARACTER;
1766#ifdef RE_ENABLE_I18N
1767      if (input->mb_cur_max > 1)
1768	{
1769	  wint_t wc = re_string_wchar_at (input,
1770					  re_string_cur_idx (input) + 1);
1771	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1772	}
1773      else
1774#endif
1775	token->word_char = IS_WORD_CHAR (c2) != 0;
1776
1777      switch (c2)
1778	{
1779	case '|':
1780	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1781	    token->type = OP_ALT;
1782	  break;
1783	case '1': case '2': case '3': case '4': case '5':
1784	case '6': case '7': case '8': case '9':
1785	  if (!(syntax & RE_NO_BK_REFS))
1786	    {
1787	      token->type = OP_BACK_REF;
1788	      token->opr.idx = c2 - '1';
1789	    }
1790	  break;
1791	case '<':
1792	  if (!(syntax & RE_NO_GNU_OPS))
1793	    {
1794	      token->type = ANCHOR;
1795	      token->opr.ctx_type = WORD_FIRST;
1796	    }
1797	  break;
1798	case '>':
1799	  if (!(syntax & RE_NO_GNU_OPS))
1800	    {
1801	      token->type = ANCHOR;
1802	      token->opr.ctx_type = WORD_LAST;
1803	    }
1804	  break;
1805	case 'b':
1806	  if (!(syntax & RE_NO_GNU_OPS))
1807	    {
1808	      token->type = ANCHOR;
1809	      token->opr.ctx_type = WORD_DELIM;
1810	    }
1811	  break;
1812	case 'B':
1813	  if (!(syntax & RE_NO_GNU_OPS))
1814	    {
1815	      token->type = ANCHOR;
1816	      token->opr.ctx_type = NOT_WORD_DELIM;
1817	    }
1818	  break;
1819	case 'w':
1820	  if (!(syntax & RE_NO_GNU_OPS))
1821	    token->type = OP_WORD;
1822	  break;
1823	case 'W':
1824	  if (!(syntax & RE_NO_GNU_OPS))
1825	    token->type = OP_NOTWORD;
1826	  break;
1827	case 's':
1828	  if (!(syntax & RE_NO_GNU_OPS))
1829	    token->type = OP_SPACE;
1830	  break;
1831	case 'S':
1832	  if (!(syntax & RE_NO_GNU_OPS))
1833	    token->type = OP_NOTSPACE;
1834	  break;
1835	case '`':
1836	  if (!(syntax & RE_NO_GNU_OPS))
1837	    {
1838	      token->type = ANCHOR;
1839	      token->opr.ctx_type = BUF_FIRST;
1840	    }
1841	  break;
1842	case '\'':
1843	  if (!(syntax & RE_NO_GNU_OPS))
1844	    {
1845	      token->type = ANCHOR;
1846	      token->opr.ctx_type = BUF_LAST;
1847	    }
1848	  break;
1849	case '(':
1850	  if (!(syntax & RE_NO_BK_PARENS))
1851	    token->type = OP_OPEN_SUBEXP;
1852	  break;
1853	case ')':
1854	  if (!(syntax & RE_NO_BK_PARENS))
1855	    token->type = OP_CLOSE_SUBEXP;
1856	  break;
1857	case '+':
1858	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1859	    token->type = OP_DUP_PLUS;
1860	  break;
1861	case '?':
1862	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1863	    token->type = OP_DUP_QUESTION;
1864	  break;
1865	case '{':
1866	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1867	    token->type = OP_OPEN_DUP_NUM;
1868	  break;
1869	case '}':
1870	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1871	    token->type = OP_CLOSE_DUP_NUM;
1872	  break;
1873	default:
1874	  break;
1875	}
1876      return 2;
1877    }
1878
1879  token->type = CHARACTER;
1880#ifdef RE_ENABLE_I18N
1881  if (input->mb_cur_max > 1)
1882    {
1883      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1884      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1885    }
1886  else
1887#endif
1888    token->word_char = IS_WORD_CHAR (token->opr.c);
1889
1890  switch (c)
1891    {
1892    case '\n':
1893      if (syntax & RE_NEWLINE_ALT)
1894	token->type = OP_ALT;
1895      break;
1896    case '|':
1897      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1898	token->type = OP_ALT;
1899      break;
1900    case '*':
1901      token->type = OP_DUP_ASTERISK;
1902      break;
1903    case '+':
1904      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1905	token->type = OP_DUP_PLUS;
1906      break;
1907    case '?':
1908      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1909	token->type = OP_DUP_QUESTION;
1910      break;
1911    case '{':
1912      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1913	token->type = OP_OPEN_DUP_NUM;
1914      break;
1915    case '}':
1916      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1917	token->type = OP_CLOSE_DUP_NUM;
1918      break;
1919    case '(':
1920      if (syntax & RE_NO_BK_PARENS)
1921	token->type = OP_OPEN_SUBEXP;
1922      break;
1923    case ')':
1924      if (syntax & RE_NO_BK_PARENS)
1925	token->type = OP_CLOSE_SUBEXP;
1926      break;
1927    case '[':
1928      token->type = OP_OPEN_BRACKET;
1929      break;
1930    case '.':
1931      token->type = OP_PERIOD;
1932      break;
1933    case '^':
1934      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1935	  re_string_cur_idx (input) != 0)
1936	{
1937	  char prev = re_string_peek_byte (input, -1);
1938	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1939	    break;
1940	}
1941      token->type = ANCHOR;
1942      token->opr.ctx_type = LINE_FIRST;
1943      break;
1944    case '$':
1945      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1946	  re_string_cur_idx (input) + 1 != re_string_length (input))
1947	{
1948	  re_token_t next;
1949	  re_string_skip_bytes (input, 1);
1950	  peek_token (&next, input, syntax);
1951	  re_string_skip_bytes (input, -1);
1952	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1953	    break;
1954	}
1955      token->type = ANCHOR;
1956      token->opr.ctx_type = LINE_LAST;
1957      break;
1958    default:
1959      break;
1960    }
1961  return 1;
1962}
1963
1964/* Peek a token from INPUT, and return the length of the token.
1965   We must not use this function out of bracket expressions.  */
1966
1967static int
1968internal_function
1969peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1970{
1971  unsigned char c;
1972  if (re_string_eoi (input))
1973    {
1974      token->type = END_OF_RE;
1975      return 0;
1976    }
1977  c = re_string_peek_byte (input, 0);
1978  token->opr.c = c;
1979
1980#ifdef RE_ENABLE_I18N
1981  if (input->mb_cur_max > 1 &&
1982      !re_string_first_byte (input, re_string_cur_idx (input)))
1983    {
1984      token->type = CHARACTER;
1985      return 1;
1986    }
1987#endif /* RE_ENABLE_I18N */
1988
1989  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1990      && re_string_cur_idx (input) + 1 < re_string_length (input))
1991    {
1992      /* In this case, '\' escape a character.  */
1993      unsigned char c2;
1994      re_string_skip_bytes (input, 1);
1995      c2 = re_string_peek_byte (input, 0);
1996      token->opr.c = c2;
1997      token->type = CHARACTER;
1998      return 1;
1999    }
2000  if (c == '[') /* '[' is a special char in a bracket exps.  */
2001    {
2002      unsigned char c2;
2003      int token_len;
2004      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2005	c2 = re_string_peek_byte (input, 1);
2006      else
2007	c2 = 0;
2008      token->opr.c = c2;
2009      token_len = 2;
2010      switch (c2)
2011	{
2012	case '.':
2013	  token->type = OP_OPEN_COLL_ELEM;
2014	  break;
2015	case '=':
2016	  token->type = OP_OPEN_EQUIV_CLASS;
2017	  break;
2018	case ':':
2019	  if (syntax & RE_CHAR_CLASSES)
2020	    {
2021	      token->type = OP_OPEN_CHAR_CLASS;
2022	      break;
2023	    }
2024	  /* else fall through.  */
2025	default:
2026	  token->type = CHARACTER;
2027	  token->opr.c = c;
2028	  token_len = 1;
2029	  break;
2030	}
2031      return token_len;
2032    }
2033  switch (c)
2034    {
2035    case '-':
2036      token->type = OP_CHARSET_RANGE;
2037      break;
2038    case ']':
2039      token->type = OP_CLOSE_BRACKET;
2040      break;
2041    case '^':
2042      token->type = OP_NON_MATCH_LIST;
2043      break;
2044    default:
2045      token->type = CHARACTER;
2046    }
2047  return 1;
2048}
2049
2050/* Functions for parser.  */
2051
2052/* Entry point of the parser.
2053   Parse the regular expression REGEXP and return the structure tree.
2054   If an error is occured, ERR is set by error code, and return NULL.
2055   This function build the following tree, from regular expression <reg_exp>:
2056	   CAT
2057	   / \
2058	  /   \
2059   <reg_exp>  EOR
2060
2061   CAT means concatenation.
2062   EOR means end of regular expression.  */
2063
2064static bin_tree_t *
2065parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2066       reg_errcode_t *err)
2067{
2068  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2069  bin_tree_t *tree, *eor, *root;
2070  re_token_t current_token;
2071  dfa->syntax = syntax;
2072  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2073  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2074  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2075    return NULL;
2076  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2077  if (tree != NULL)
2078    root = create_tree (dfa, tree, eor, CONCAT);
2079  else
2080    root = eor;
2081  if (BE (eor == NULL || root == NULL, 0))
2082    {
2083      *err = REG_ESPACE;
2084      return NULL;
2085    }
2086  return root;
2087}
2088
2089/* This function build the following tree, from regular expression
2090   <branch1>|<branch2>:
2091	   ALT
2092	   / \
2093	  /   \
2094   <branch1> <branch2>
2095
2096   ALT means alternative, which represents the operator `|'.  */
2097
2098static bin_tree_t *
2099parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2100	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2101{
2102  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2103  bin_tree_t *tree, *branch = NULL;
2104  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2105  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2106    return NULL;
2107
2108  while (token->type == OP_ALT)
2109    {
2110      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2111      if (token->type != OP_ALT && token->type != END_OF_RE
2112	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2113	{
2114	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2115	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2116	    return NULL;
2117	}
2118      else
2119	branch = NULL;
2120      tree = create_tree (dfa, tree, branch, OP_ALT);
2121      if (BE (tree == NULL, 0))
2122	{
2123	  *err = REG_ESPACE;
2124	  return NULL;
2125	}
2126    }
2127  return tree;
2128}
2129
2130/* This function build the following tree, from regular expression
2131   <exp1><exp2>:
2132	CAT
2133	/ \
2134       /   \
2135   <exp1> <exp2>
2136
2137   CAT means concatenation.  */
2138
2139static bin_tree_t *
2140parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2141	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2142{
2143  bin_tree_t *tree, *exp;
2144  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2145  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2146  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147    return NULL;
2148
2149  while (token->type != OP_ALT && token->type != END_OF_RE
2150	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2151    {
2152      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2153      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2154	{
2155	  return NULL;
2156	}
2157      if (tree != NULL && exp != NULL)
2158	{
2159	  tree = create_tree (dfa, tree, exp, CONCAT);
2160	  if (tree == NULL)
2161	    {
2162	      *err = REG_ESPACE;
2163	      return NULL;
2164	    }
2165	}
2166      else if (tree == NULL)
2167	tree = exp;
2168      /* Otherwise exp == NULL, we don't need to create new tree.  */
2169    }
2170  return tree;
2171}
2172
2173/* This function build the following tree, from regular expression a*:
2174	 *
2175	 |
2176	 a
2177*/
2178
2179static bin_tree_t *
2180parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2181		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2182{
2183  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184  bin_tree_t *tree;
2185  switch (token->type)
2186    {
2187    case CHARACTER:
2188      tree = create_token_tree (dfa, NULL, NULL, token);
2189      if (BE (tree == NULL, 0))
2190	{
2191	  *err = REG_ESPACE;
2192	  return NULL;
2193	}
2194#ifdef RE_ENABLE_I18N
2195      if (dfa->mb_cur_max > 1)
2196	{
2197	  while (!re_string_eoi (regexp)
2198		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2199	    {
2200	      bin_tree_t *mbc_remain;
2201	      fetch_token (token, regexp, syntax);
2202	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2203	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2204	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2205		{
2206		  *err = REG_ESPACE;
2207		  return NULL;
2208		}
2209	    }
2210	}
2211#endif
2212      break;
2213    case OP_OPEN_SUBEXP:
2214      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2215      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2216	return NULL;
2217      break;
2218    case OP_OPEN_BRACKET:
2219      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2220      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221	return NULL;
2222      break;
2223    case OP_BACK_REF:
2224      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2225	{
2226	  *err = REG_ESUBREG;
2227	  return NULL;
2228	}
2229      dfa->used_bkref_map |= 1 << token->opr.idx;
2230      tree = create_token_tree (dfa, NULL, NULL, token);
2231      if (BE (tree == NULL, 0))
2232	{
2233	  *err = REG_ESPACE;
2234	  return NULL;
2235	}
2236      ++dfa->nbackref;
2237      dfa->has_mb_node = 1;
2238      break;
2239    case OP_OPEN_DUP_NUM:
2240      if (syntax & RE_CONTEXT_INVALID_DUP)
2241	{
2242	  *err = REG_BADRPT;
2243	  return NULL;
2244	}
2245      /* FALLTHROUGH */
2246    case OP_DUP_ASTERISK:
2247    case OP_DUP_PLUS:
2248    case OP_DUP_QUESTION:
2249      if (syntax & RE_CONTEXT_INVALID_OPS)
2250	{
2251	  *err = REG_BADRPT;
2252	  return NULL;
2253	}
2254      else if (syntax & RE_CONTEXT_INDEP_OPS)
2255	{
2256	  fetch_token (token, regexp, syntax);
2257	  return parse_expression (regexp, preg, token, syntax, nest, err);
2258	}
2259      /* else fall through  */
2260    case OP_CLOSE_SUBEXP:
2261      if ((token->type == OP_CLOSE_SUBEXP) &&
2262	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2263	{
2264	  *err = REG_ERPAREN;
2265	  return NULL;
2266	}
2267      /* else fall through  */
2268    case OP_CLOSE_DUP_NUM:
2269      /* We treat it as a normal character.  */
2270
2271      /* Then we can these characters as normal characters.  */
2272      token->type = CHARACTER;
2273      /* mb_partial and word_char bits should be initialized already
2274	 by peek_token.  */
2275      tree = create_token_tree (dfa, NULL, NULL, token);
2276      if (BE (tree == NULL, 0))
2277	{
2278	  *err = REG_ESPACE;
2279	  return NULL;
2280	}
2281      break;
2282    case ANCHOR:
2283      if ((token->opr.ctx_type
2284	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2285	  && dfa->word_ops_used == 0)
2286	init_word_char (dfa);
2287      if (token->opr.ctx_type == WORD_DELIM
2288          || token->opr.ctx_type == NOT_WORD_DELIM)
2289	{
2290	  bin_tree_t *tree_first, *tree_last;
2291	  if (token->opr.ctx_type == WORD_DELIM)
2292	    {
2293	      token->opr.ctx_type = WORD_FIRST;
2294	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2295	      token->opr.ctx_type = WORD_LAST;
2296            }
2297          else
2298            {
2299	      token->opr.ctx_type = INSIDE_WORD;
2300	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2301	      token->opr.ctx_type = INSIDE_NOTWORD;
2302            }
2303	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2304	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2305	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2306	    {
2307	      *err = REG_ESPACE;
2308	      return NULL;
2309	    }
2310	}
2311      else
2312	{
2313	  tree = create_token_tree (dfa, NULL, NULL, token);
2314	  if (BE (tree == NULL, 0))
2315	    {
2316	      *err = REG_ESPACE;
2317	      return NULL;
2318	    }
2319	}
2320      /* We must return here, since ANCHORs can't be followed
2321	 by repetition operators.
2322	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2323	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2324      fetch_token (token, regexp, syntax);
2325      return tree;
2326    case OP_PERIOD:
2327      tree = create_token_tree (dfa, NULL, NULL, token);
2328      if (BE (tree == NULL, 0))
2329	{
2330	  *err = REG_ESPACE;
2331	  return NULL;
2332	}
2333      if (dfa->mb_cur_max > 1)
2334	dfa->has_mb_node = 1;
2335      break;
2336    case OP_WORD:
2337    case OP_NOTWORD:
2338      tree = build_charclass_op (dfa, regexp->trans,
2339				 (const unsigned char *) "alnum",
2340				 (const unsigned char *) "_",
2341				 token->type == OP_NOTWORD, err);
2342      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2343	return NULL;
2344      break;
2345    case OP_SPACE:
2346    case OP_NOTSPACE:
2347      tree = build_charclass_op (dfa, regexp->trans,
2348				 (const unsigned char *) "space",
2349				 (const unsigned char *) "",
2350				 token->type == OP_NOTSPACE, err);
2351      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2352	return NULL;
2353      break;
2354    case OP_ALT:
2355    case END_OF_RE:
2356      return NULL;
2357    case BACK_SLASH:
2358      *err = REG_EESCAPE;
2359      return NULL;
2360    default:
2361      /* Must not happen?  */
2362#ifdef DEBUG
2363      assert (0);
2364#endif
2365      return NULL;
2366    }
2367  fetch_token (token, regexp, syntax);
2368
2369  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2370	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2371    {
2372      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2373      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2374	return NULL;
2375      /* In BRE consecutive duplications are not allowed.  */
2376      if ((syntax & RE_CONTEXT_INVALID_DUP)
2377	  && (token->type == OP_DUP_ASTERISK
2378	      || token->type == OP_OPEN_DUP_NUM))
2379	{
2380	  *err = REG_BADRPT;
2381	  return NULL;
2382	}
2383    }
2384
2385  return tree;
2386}
2387
2388/* This function build the following tree, from regular expression
2389   (<reg_exp>):
2390	 SUBEXP
2391	    |
2392	<reg_exp>
2393*/
2394
2395static bin_tree_t *
2396parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2397	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2398{
2399  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2400  bin_tree_t *tree;
2401  size_t cur_nsub;
2402  cur_nsub = preg->re_nsub++;
2403
2404  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2405
2406  /* The subexpression may be a null string.  */
2407  if (token->type == OP_CLOSE_SUBEXP)
2408    tree = NULL;
2409  else
2410    {
2411      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2412      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2413        *err = REG_EPAREN;
2414      if (BE (*err != REG_NOERROR, 0))
2415	return NULL;
2416    }
2417
2418  if (cur_nsub <= '9' - '1')
2419    dfa->completed_bkref_map |= 1 << cur_nsub;
2420
2421  tree = create_tree (dfa, tree, NULL, SUBEXP);
2422  if (BE (tree == NULL, 0))
2423    {
2424      *err = REG_ESPACE;
2425      return NULL;
2426    }
2427  tree->token.opr.idx = cur_nsub;
2428  return tree;
2429}
2430
2431/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2432
2433static bin_tree_t *
2434parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2435	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2436{
2437  bin_tree_t *tree = NULL, *old_tree = NULL;
2438  int i, start, end, start_idx = re_string_cur_idx (regexp);
2439  re_token_t start_token = *token;
2440
2441  if (token->type == OP_OPEN_DUP_NUM)
2442    {
2443      end = 0;
2444      start = fetch_number (regexp, token, syntax);
2445      if (start == -1)
2446	{
2447	  if (token->type == CHARACTER && token->opr.c == ',')
2448	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2449	  else
2450	    {
2451	      *err = REG_BADBR; /* <re>{} is invalid.  */
2452	      return NULL;
2453	    }
2454	}
2455      if (BE (start != -2, 1))
2456	{
2457	  /* We treat "{n}" as "{n,n}".  */
2458	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2459		 : ((token->type == CHARACTER && token->opr.c == ',')
2460		    ? fetch_number (regexp, token, syntax) : -2));
2461	}
2462      if (BE (start == -2 || end == -2, 0))
2463	{
2464	  /* Invalid sequence.  */
2465	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2466	    {
2467	      if (token->type == END_OF_RE)
2468		*err = REG_EBRACE;
2469	      else
2470		*err = REG_BADBR;
2471
2472	      return NULL;
2473	    }
2474
2475	  /* If the syntax bit is set, rollback.  */
2476	  re_string_set_index (regexp, start_idx);
2477	  *token = start_token;
2478	  token->type = CHARACTER;
2479	  /* mb_partial and word_char bits should be already initialized by
2480	     peek_token.  */
2481	  return elem;
2482	}
2483
2484      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2485	{
2486	  /* First number greater than second.  */
2487	  *err = REG_BADBR;
2488	  return NULL;
2489	}
2490    }
2491  else
2492    {
2493      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2494      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2495    }
2496
2497  fetch_token (token, regexp, syntax);
2498
2499  if (BE (elem == NULL, 0))
2500    return NULL;
2501  if (BE (start == 0 && end == 0, 0))
2502    {
2503      postorder (elem, free_tree, NULL);
2504      return NULL;
2505    }
2506
2507  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2508  if (BE (start > 0, 0))
2509    {
2510      tree = elem;
2511      for (i = 2; i <= start; ++i)
2512	{
2513	  elem = duplicate_tree (elem, dfa);
2514	  tree = create_tree (dfa, tree, elem, CONCAT);
2515	  if (BE (elem == NULL || tree == NULL, 0))
2516	    goto parse_dup_op_espace;
2517	}
2518
2519      if (start == end)
2520	return tree;
2521
2522      /* Duplicate ELEM before it is marked optional.  */
2523      elem = duplicate_tree (elem, dfa);
2524      old_tree = tree;
2525    }
2526  else
2527    old_tree = NULL;
2528
2529  if (elem->token.type == SUBEXP)
2530    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2531
2532  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2533  if (BE (tree == NULL, 0))
2534    goto parse_dup_op_espace;
2535
2536  /* This loop is actually executed only when end != -1,
2537     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2538     already created the start+1-th copy.  */
2539  for (i = start + 2; i <= end; ++i)
2540    {
2541      elem = duplicate_tree (elem, dfa);
2542      tree = create_tree (dfa, tree, elem, CONCAT);
2543      if (BE (elem == NULL || tree == NULL, 0))
2544        goto parse_dup_op_espace;
2545
2546      tree = create_tree (dfa, tree, NULL, OP_ALT);
2547      if (BE (tree == NULL, 0))
2548        goto parse_dup_op_espace;
2549    }
2550
2551  if (old_tree)
2552    tree = create_tree (dfa, old_tree, tree, CONCAT);
2553
2554  return tree;
2555
2556 parse_dup_op_espace:
2557  *err = REG_ESPACE;
2558  return NULL;
2559}
2560
2561/* Size of the names for collating symbol/equivalence_class/character_class.
2562   I'm not sure, but maybe enough.  */
2563#define BRACKET_NAME_BUF_SIZE 32
2564
2565#ifndef _LIBC
2566  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2567     Build the range expression which starts from START_ELEM, and ends
2568     at END_ELEM.  The result are written to MBCSET and SBCSET.
2569     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2570     mbcset->range_ends, is a pointer argument sinse we may
2571     update it.  */
2572
2573static reg_errcode_t
2574internal_function
2575# ifdef RE_ENABLE_I18N
2576build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2577		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2578# else /* not RE_ENABLE_I18N */
2579build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2580		 bracket_elem_t *end_elem)
2581# endif /* not RE_ENABLE_I18N */
2582{
2583  unsigned int start_ch, end_ch;
2584  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2585  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2586	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2587	  0))
2588    return REG_ERANGE;
2589
2590  /* We can handle no multi character collating elements without libc
2591     support.  */
2592  if (BE ((start_elem->type == COLL_SYM
2593	   && strlen ((char *) start_elem->opr.name) > 1)
2594	  || (end_elem->type == COLL_SYM
2595	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2596    return REG_ECOLLATE;
2597
2598# ifdef RE_ENABLE_I18N
2599  {
2600    wchar_t wc;
2601    wint_t start_wc;
2602    wint_t end_wc;
2603    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2604
2605    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2606		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2607		   : 0));
2608    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2609	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2610		 : 0));
2611    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2612		? __btowc (start_ch) : start_elem->opr.wch);
2613    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2614	      ? __btowc (end_ch) : end_elem->opr.wch);
2615    if (start_wc == WEOF || end_wc == WEOF)
2616      return REG_ECOLLATE;
2617    cmp_buf[0] = start_wc;
2618    cmp_buf[4] = end_wc;
2619    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2620      return REG_ERANGE;
2621
2622    /* Got valid collation sequence values, add them as a new entry.
2623       However, for !_LIBC we have no collation elements: if the
2624       character set is single byte, the single byte character set
2625       that we build below suffices.  parse_bracket_exp passes
2626       no MBCSET if dfa->mb_cur_max == 1.  */
2627    if (mbcset)
2628      {
2629        /* Check the space of the arrays.  */
2630        if (BE (*range_alloc == mbcset->nranges, 0))
2631          {
2632	    /* There is not enough space, need realloc.  */
2633	    wchar_t *new_array_start, *new_array_end;
2634	    int new_nranges;
2635
2636	    /* +1 in case of mbcset->nranges is 0.  */
2637	    new_nranges = 2 * mbcset->nranges + 1;
2638	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2639	       are NULL if *range_alloc == 0.  */
2640	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2641				          new_nranges);
2642	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2643				        new_nranges);
2644
2645	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2646	      return REG_ESPACE;
2647
2648	    mbcset->range_starts = new_array_start;
2649	    mbcset->range_ends = new_array_end;
2650	    *range_alloc = new_nranges;
2651          }
2652
2653        mbcset->range_starts[mbcset->nranges] = start_wc;
2654        mbcset->range_ends[mbcset->nranges++] = end_wc;
2655      }
2656
2657    /* Build the table for single byte characters.  */
2658    for (wc = 0; wc < SBC_MAX; ++wc)
2659      {
2660	cmp_buf[2] = wc;
2661	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2662	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2663	  bitset_set (sbcset, wc);
2664      }
2665  }
2666# else /* not RE_ENABLE_I18N */
2667  {
2668    unsigned int ch;
2669    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2670		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2671		   : 0));
2672    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2673	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2674		 : 0));
2675    if (start_ch > end_ch)
2676      return REG_ERANGE;
2677    /* Build the table for single byte characters.  */
2678    for (ch = 0; ch < SBC_MAX; ++ch)
2679      if (start_ch <= ch  && ch <= end_ch)
2680	bitset_set (sbcset, ch);
2681  }
2682# endif /* not RE_ENABLE_I18N */
2683  return REG_NOERROR;
2684}
2685#endif /* not _LIBC */
2686
2687#ifndef _LIBC
2688/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2689   Build the collating element which is represented by NAME.
2690   The result are written to MBCSET and SBCSET.
2691   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2692   pointer argument since we may update it.  */
2693
2694static reg_errcode_t
2695internal_function
2696# ifdef RE_ENABLE_I18N
2697build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2698			int *coll_sym_alloc, const unsigned char *name)
2699# else /* not RE_ENABLE_I18N */
2700build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2701# endif /* not RE_ENABLE_I18N */
2702{
2703  size_t name_len = strlen ((const char *) name);
2704  if (BE (name_len != 1, 0))
2705    return REG_ECOLLATE;
2706  else
2707    {
2708      bitset_set (sbcset, name[0]);
2709      return REG_NOERROR;
2710    }
2711}
2712#endif /* not _LIBC */
2713
2714/* This function parse bracket expression like "[abc]", "[a-c]",
2715   "[[.a-a.]]" etc.  */
2716
2717static bin_tree_t *
2718parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2719		   reg_syntax_t syntax, reg_errcode_t *err)
2720{
2721#ifdef _LIBC
2722  const unsigned char *collseqmb;
2723  const char *collseqwc;
2724  uint32_t nrules;
2725  int32_t table_size;
2726  const int32_t *symb_table;
2727  const unsigned char *extra;
2728
2729  /* Local function for parse_bracket_exp used in _LIBC environement.
2730     Seek the collating symbol entry correspondings to NAME.
2731     Return the index of the symbol in the SYMB_TABLE.  */
2732
2733  auto inline int32_t
2734  __attribute ((always_inline))
2735  seek_collating_symbol_entry (name, name_len)
2736	 const unsigned char *name;
2737	 size_t name_len;
2738    {
2739      int32_t hash = elem_hash ((const char *) name, name_len);
2740      int32_t elem = hash % table_size;
2741      if (symb_table[2 * elem] != 0)
2742	{
2743	  int32_t second = hash % (table_size - 2) + 1;
2744
2745	  do
2746	    {
2747	      /* First compare the hashing value.  */
2748	      if (symb_table[2 * elem] == hash
2749		  /* Compare the length of the name.  */
2750		  && name_len == extra[symb_table[2 * elem + 1]]
2751		  /* Compare the name.  */
2752		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2753			     name_len) == 0)
2754		{
2755		  /* Yep, this is the entry.  */
2756		  break;
2757		}
2758
2759	      /* Next entry.  */
2760	      elem += second;
2761	    }
2762	  while (symb_table[2 * elem] != 0);
2763	}
2764      return elem;
2765    }
2766
2767  /* Local function for parse_bracket_exp used in _LIBC environment.
2768     Look up the collation sequence value of BR_ELEM.
2769     Return the value if succeeded, UINT_MAX otherwise.  */
2770
2771  auto inline unsigned int
2772  __attribute ((always_inline))
2773  lookup_collation_sequence_value (br_elem)
2774	 bracket_elem_t *br_elem;
2775    {
2776      if (br_elem->type == SB_CHAR)
2777	{
2778	  /*
2779	  if (MB_CUR_MAX == 1)
2780	  */
2781	  if (nrules == 0)
2782	    return collseqmb[br_elem->opr.ch];
2783	  else
2784	    {
2785	      wint_t wc = __btowc (br_elem->opr.ch);
2786	      return __collseq_table_lookup (collseqwc, wc);
2787	    }
2788	}
2789      else if (br_elem->type == MB_CHAR)
2790	{
2791	  if (nrules != 0)
2792	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2793	}
2794      else if (br_elem->type == COLL_SYM)
2795	{
2796	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2797	  if (nrules != 0)
2798	    {
2799	      int32_t elem, idx;
2800	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2801						  sym_name_len);
2802	      if (symb_table[2 * elem] != 0)
2803		{
2804		  /* We found the entry.  */
2805		  idx = symb_table[2 * elem + 1];
2806		  /* Skip the name of collating element name.  */
2807		  idx += 1 + extra[idx];
2808		  /* Skip the byte sequence of the collating element.  */
2809		  idx += 1 + extra[idx];
2810		  /* Adjust for the alignment.  */
2811		  idx = (idx + 3) & ~3;
2812		  /* Skip the multibyte collation sequence value.  */
2813		  idx += sizeof (unsigned int);
2814		  /* Skip the wide char sequence of the collating element.  */
2815		  idx += sizeof (unsigned int) *
2816		    (1 + *(unsigned int *) (extra + idx));
2817		  /* Return the collation sequence value.  */
2818		  return *(unsigned int *) (extra + idx);
2819		}
2820	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2821		{
2822		  /* No valid character.  Match it as a single byte
2823		     character.  */
2824		  return collseqmb[br_elem->opr.name[0]];
2825		}
2826	    }
2827	  else if (sym_name_len == 1)
2828	    return collseqmb[br_elem->opr.name[0]];
2829	}
2830      return UINT_MAX;
2831    }
2832
2833  /* Local function for parse_bracket_exp used in _LIBC environement.
2834     Build the range expression which starts from START_ELEM, and ends
2835     at END_ELEM.  The result are written to MBCSET and SBCSET.
2836     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2837     mbcset->range_ends, is a pointer argument sinse we may
2838     update it.  */
2839
2840  auto inline reg_errcode_t
2841  __attribute ((always_inline))
2842  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2843	 re_charset_t *mbcset;
2844	 int *range_alloc;
2845	 bitset_t sbcset;
2846	 bracket_elem_t *start_elem, *end_elem;
2847    {
2848      unsigned int ch;
2849      uint32_t start_collseq;
2850      uint32_t end_collseq;
2851
2852      /* Equivalence Classes and Character Classes can't be a range
2853	 start/end.  */
2854      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2855	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2856	      0))
2857	return REG_ERANGE;
2858
2859      start_collseq = lookup_collation_sequence_value (start_elem);
2860      end_collseq = lookup_collation_sequence_value (end_elem);
2861      /* Check start/end collation sequence values.  */
2862      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2863	return REG_ECOLLATE;
2864      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2865	return REG_ERANGE;
2866
2867      /* Got valid collation sequence values, add them as a new entry.
2868	 However, if we have no collation elements, and the character set
2869	 is single byte, the single byte character set that we
2870	 build below suffices. */
2871      if (nrules > 0 || dfa->mb_cur_max > 1)
2872	{
2873          /* Check the space of the arrays.  */
2874          if (BE (*range_alloc == mbcset->nranges, 0))
2875	    {
2876	      /* There is not enough space, need realloc.  */
2877	      uint32_t *new_array_start;
2878	      uint32_t *new_array_end;
2879	      int new_nranges;
2880
2881	      /* +1 in case of mbcset->nranges is 0.  */
2882	      new_nranges = 2 * mbcset->nranges + 1;
2883	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2884					    new_nranges);
2885	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2886				          new_nranges);
2887
2888	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2889	        return REG_ESPACE;
2890
2891	      mbcset->range_starts = new_array_start;
2892	      mbcset->range_ends = new_array_end;
2893	      *range_alloc = new_nranges;
2894	    }
2895
2896          mbcset->range_starts[mbcset->nranges] = start_collseq;
2897          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2898	}
2899
2900      /* Build the table for single byte characters.  */
2901      for (ch = 0; ch < SBC_MAX; ch++)
2902	{
2903	  uint32_t ch_collseq;
2904	  /*
2905	  if (MB_CUR_MAX == 1)
2906	  */
2907	  if (nrules == 0)
2908	    ch_collseq = collseqmb[ch];
2909	  else
2910	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2911	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2912	    bitset_set (sbcset, ch);
2913	}
2914      return REG_NOERROR;
2915    }
2916
2917  /* Local function for parse_bracket_exp used in _LIBC environement.
2918     Build the collating element which is represented by NAME.
2919     The result are written to MBCSET and SBCSET.
2920     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2921     pointer argument sinse we may update it.  */
2922
2923  auto inline reg_errcode_t
2924  __attribute ((always_inline))
2925  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2926	 re_charset_t *mbcset;
2927	 int *coll_sym_alloc;
2928	 bitset_t sbcset;
2929	 const unsigned char *name;
2930    {
2931      int32_t elem, idx;
2932      size_t name_len = strlen ((const char *) name);
2933      if (nrules != 0)
2934	{
2935	  elem = seek_collating_symbol_entry (name, name_len);
2936	  if (symb_table[2 * elem] != 0)
2937	    {
2938	      /* We found the entry.  */
2939	      idx = symb_table[2 * elem + 1];
2940	      /* Skip the name of collating element name.  */
2941	      idx += 1 + extra[idx];
2942	    }
2943	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2944	    {
2945	      /* No valid character, treat it as a normal
2946		 character.  */
2947	      bitset_set (sbcset, name[0]);
2948	      return REG_NOERROR;
2949	    }
2950	  else
2951	    return REG_ECOLLATE;
2952
2953	  /* Got valid collation sequence, add it as a new entry.  */
2954	  /* Check the space of the arrays.  */
2955	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2956	    {
2957	      /* Not enough, realloc it.  */
2958	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2959	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2960	      /* Use realloc since mbcset->coll_syms is NULL
2961		 if *alloc == 0.  */
2962	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2963						   new_coll_sym_alloc);
2964	      if (BE (new_coll_syms == NULL, 0))
2965		return REG_ESPACE;
2966	      mbcset->coll_syms = new_coll_syms;
2967	      *coll_sym_alloc = new_coll_sym_alloc;
2968	    }
2969	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2970	  return REG_NOERROR;
2971	}
2972      else
2973	{
2974	  if (BE (name_len != 1, 0))
2975	    return REG_ECOLLATE;
2976	  else
2977	    {
2978	      bitset_set (sbcset, name[0]);
2979	      return REG_NOERROR;
2980	    }
2981	}
2982    }
2983#endif
2984
2985  re_token_t br_token;
2986  re_bitset_ptr_t sbcset;
2987#ifdef RE_ENABLE_I18N
2988  re_charset_t *mbcset;
2989  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2990  int equiv_class_alloc = 0, char_class_alloc = 0;
2991#endif /* not RE_ENABLE_I18N */
2992  int non_match = 0;
2993  bin_tree_t *work_tree;
2994  int token_len;
2995  int first_round = 1;
2996#ifdef _LIBC
2997  collseqmb = (const unsigned char *)
2998    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2999  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3000  if (nrules)
3001    {
3002      /*
3003      if (MB_CUR_MAX > 1)
3004      */
3005      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3006      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3007      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3008						  _NL_COLLATE_SYMB_TABLEMB);
3009      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3010						   _NL_COLLATE_SYMB_EXTRAMB);
3011    }
3012#endif
3013  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3014#ifdef RE_ENABLE_I18N
3015  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3016#endif /* RE_ENABLE_I18N */
3017#ifdef RE_ENABLE_I18N
3018  if (BE (sbcset == NULL || mbcset == NULL, 0))
3019#else
3020  if (BE (sbcset == NULL, 0))
3021#endif /* RE_ENABLE_I18N */
3022    {
3023      *err = REG_ESPACE;
3024      return NULL;
3025    }
3026
3027  token_len = peek_token_bracket (token, regexp, syntax);
3028  if (BE (token->type == END_OF_RE, 0))
3029    {
3030      *err = REG_BADPAT;
3031      goto parse_bracket_exp_free_return;
3032    }
3033  if (token->type == OP_NON_MATCH_LIST)
3034    {
3035#ifdef RE_ENABLE_I18N
3036      mbcset->non_match = 1;
3037#endif /* not RE_ENABLE_I18N */
3038      non_match = 1;
3039      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3040	bitset_set (sbcset, '\n');
3041      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3042      token_len = peek_token_bracket (token, regexp, syntax);
3043      if (BE (token->type == END_OF_RE, 0))
3044	{
3045	  *err = REG_BADPAT;
3046	  goto parse_bracket_exp_free_return;
3047	}
3048    }
3049
3050  /* We treat the first ']' as a normal character.  */
3051  if (token->type == OP_CLOSE_BRACKET)
3052    token->type = CHARACTER;
3053
3054  while (1)
3055    {
3056      bracket_elem_t start_elem, end_elem;
3057      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3058      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3059      reg_errcode_t ret;
3060      int token_len2 = 0, is_range_exp = 0;
3061      re_token_t token2;
3062
3063      start_elem.opr.name = start_name_buf;
3064      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3065				   syntax, first_round);
3066      if (BE (ret != REG_NOERROR, 0))
3067	{
3068	  *err = ret;
3069	  goto parse_bracket_exp_free_return;
3070	}
3071      first_round = 0;
3072
3073      /* Get information about the next token.  We need it in any case.  */
3074      token_len = peek_token_bracket (token, regexp, syntax);
3075
3076      /* Do not check for ranges if we know they are not allowed.  */
3077      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3078	{
3079	  if (BE (token->type == END_OF_RE, 0))
3080	    {
3081	      *err = REG_EBRACK;
3082	      goto parse_bracket_exp_free_return;
3083	    }
3084	  if (token->type == OP_CHARSET_RANGE)
3085	    {
3086	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3087	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3088	      if (BE (token2.type == END_OF_RE, 0))
3089		{
3090		  *err = REG_EBRACK;
3091		  goto parse_bracket_exp_free_return;
3092		}
3093	      if (token2.type == OP_CLOSE_BRACKET)
3094		{
3095		  /* We treat the last '-' as a normal character.  */
3096		  re_string_skip_bytes (regexp, -token_len);
3097		  token->type = CHARACTER;
3098		}
3099	      else
3100		is_range_exp = 1;
3101	    }
3102	}
3103
3104      if (is_range_exp == 1)
3105	{
3106	  end_elem.opr.name = end_name_buf;
3107	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3108				       dfa, syntax, 1);
3109	  if (BE (ret != REG_NOERROR, 0))
3110	    {
3111	      *err = ret;
3112	      goto parse_bracket_exp_free_return;
3113	    }
3114
3115	  token_len = peek_token_bracket (token, regexp, syntax);
3116
3117#ifdef _LIBC
3118	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3119				  &start_elem, &end_elem);
3120#else
3121# ifdef RE_ENABLE_I18N
3122	  *err = build_range_exp (sbcset,
3123				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3124				  &range_alloc, &start_elem, &end_elem);
3125# else
3126	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3127# endif
3128#endif /* RE_ENABLE_I18N */
3129	  if (BE (*err != REG_NOERROR, 0))
3130	    goto parse_bracket_exp_free_return;
3131	}
3132      else
3133	{
3134	  switch (start_elem.type)
3135	    {
3136	    case SB_CHAR:
3137	      bitset_set (sbcset, start_elem.opr.ch);
3138	      break;
3139#ifdef RE_ENABLE_I18N
3140	    case MB_CHAR:
3141	      /* Check whether the array has enough space.  */
3142	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3143		{
3144		  wchar_t *new_mbchars;
3145		  /* Not enough, realloc it.  */
3146		  /* +1 in case of mbcset->nmbchars is 0.  */
3147		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3148		  /* Use realloc since array is NULL if *alloc == 0.  */
3149		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3150					    mbchar_alloc);
3151		  if (BE (new_mbchars == NULL, 0))
3152		    goto parse_bracket_exp_espace;
3153		  mbcset->mbchars = new_mbchars;
3154		}
3155	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3156	      break;
3157#endif /* RE_ENABLE_I18N */
3158	    case EQUIV_CLASS:
3159	      *err = build_equiv_class (sbcset,
3160#ifdef RE_ENABLE_I18N
3161					mbcset, &equiv_class_alloc,
3162#endif /* RE_ENABLE_I18N */
3163					start_elem.opr.name);
3164	      if (BE (*err != REG_NOERROR, 0))
3165		goto parse_bracket_exp_free_return;
3166	      break;
3167	    case COLL_SYM:
3168	      *err = build_collating_symbol (sbcset,
3169#ifdef RE_ENABLE_I18N
3170					     mbcset, &coll_sym_alloc,
3171#endif /* RE_ENABLE_I18N */
3172					     start_elem.opr.name);
3173	      if (BE (*err != REG_NOERROR, 0))
3174		goto parse_bracket_exp_free_return;
3175	      break;
3176	    case CHAR_CLASS:
3177	      *err = build_charclass (regexp->trans, sbcset,
3178#ifdef RE_ENABLE_I18N
3179				      mbcset, &char_class_alloc,
3180#endif /* RE_ENABLE_I18N */
3181				      start_elem.opr.name, syntax);
3182	      if (BE (*err != REG_NOERROR, 0))
3183	       goto parse_bracket_exp_free_return;
3184	      break;
3185	    default:
3186	      assert (0);
3187	      break;
3188	    }
3189	}
3190      if (BE (token->type == END_OF_RE, 0))
3191	{
3192	  *err = REG_EBRACK;
3193	  goto parse_bracket_exp_free_return;
3194	}
3195      if (token->type == OP_CLOSE_BRACKET)
3196	break;
3197    }
3198
3199  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3200
3201  /* If it is non-matching list.  */
3202  if (non_match)
3203    bitset_not (sbcset);
3204
3205#ifdef RE_ENABLE_I18N
3206  /* Ensure only single byte characters are set.  */
3207  if (dfa->mb_cur_max > 1)
3208    bitset_mask (sbcset, dfa->sb_char);
3209
3210  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3211      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3212						     || mbcset->non_match)))
3213    {
3214      bin_tree_t *mbc_tree;
3215      int sbc_idx;
3216      /* Build a tree for complex bracket.  */
3217      dfa->has_mb_node = 1;
3218      br_token.type = COMPLEX_BRACKET;
3219      br_token.opr.mbcset = mbcset;
3220      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3221      if (BE (mbc_tree == NULL, 0))
3222	goto parse_bracket_exp_espace;
3223      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3224	if (sbcset[sbc_idx])
3225	  break;
3226      /* If there are no bits set in sbcset, there is no point
3227	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3228      if (sbc_idx < BITSET_WORDS)
3229	{
3230          /* Build a tree for simple bracket.  */
3231          br_token.type = SIMPLE_BRACKET;
3232          br_token.opr.sbcset = sbcset;
3233          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3234          if (BE (work_tree == NULL, 0))
3235            goto parse_bracket_exp_espace;
3236
3237          /* Then join them by ALT node.  */
3238          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3239          if (BE (work_tree == NULL, 0))
3240            goto parse_bracket_exp_espace;
3241	}
3242      else
3243	{
3244	  re_free (sbcset);
3245	  work_tree = mbc_tree;
3246	}
3247    }
3248  else
3249#endif /* not RE_ENABLE_I18N */
3250    {
3251#ifdef RE_ENABLE_I18N
3252      free_charset (mbcset);
3253#endif
3254      /* Build a tree for simple bracket.  */
3255      br_token.type = SIMPLE_BRACKET;
3256      br_token.opr.sbcset = sbcset;
3257      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3258      if (BE (work_tree == NULL, 0))
3259        goto parse_bracket_exp_espace;
3260    }
3261  return work_tree;
3262
3263 parse_bracket_exp_espace:
3264  *err = REG_ESPACE;
3265 parse_bracket_exp_free_return:
3266  re_free (sbcset);
3267#ifdef RE_ENABLE_I18N
3268  free_charset (mbcset);
3269#endif /* RE_ENABLE_I18N */
3270  return NULL;
3271}
3272
3273/* Parse an element in the bracket expression.  */
3274
3275static reg_errcode_t
3276parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3277		       re_token_t *token, int token_len, re_dfa_t *dfa,
3278		       reg_syntax_t syntax, int accept_hyphen)
3279{
3280#ifdef RE_ENABLE_I18N
3281  int cur_char_size;
3282  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3283  if (cur_char_size > 1)
3284    {
3285      elem->type = MB_CHAR;
3286      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3287      re_string_skip_bytes (regexp, cur_char_size);
3288      return REG_NOERROR;
3289    }
3290#endif /* RE_ENABLE_I18N */
3291  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3292  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3293      || token->type == OP_OPEN_EQUIV_CLASS)
3294    return parse_bracket_symbol (elem, regexp, token);
3295  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3296    {
3297      /* A '-' must only appear as anything but a range indicator before
3298	 the closing bracket.  Everything else is an error.  */
3299      re_token_t token2;
3300      (void) peek_token_bracket (&token2, regexp, syntax);
3301      if (token2.type != OP_CLOSE_BRACKET)
3302	/* The actual error value is not standardized since this whole
3303	   case is undefined.  But ERANGE makes good sense.  */
3304	return REG_ERANGE;
3305    }
3306  elem->type = SB_CHAR;
3307  elem->opr.ch = token->opr.c;
3308  return REG_NOERROR;
3309}
3310
3311/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3312   such as [:<character_class>:], [.<collating_element>.], and
3313   [=<equivalent_class>=].  */
3314
3315static reg_errcode_t
3316parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3317		      re_token_t *token)
3318{
3319  unsigned char ch, delim = token->opr.c;
3320  int i = 0;
3321  if (re_string_eoi(regexp))
3322    return REG_EBRACK;
3323  for (;; ++i)
3324    {
3325      if (i >= BRACKET_NAME_BUF_SIZE)
3326	return REG_EBRACK;
3327      if (token->type == OP_OPEN_CHAR_CLASS)
3328	ch = re_string_fetch_byte_case (regexp);
3329      else
3330	ch = re_string_fetch_byte (regexp);
3331      if (re_string_eoi(regexp))
3332	return REG_EBRACK;
3333      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3334	break;
3335      elem->opr.name[i] = ch;
3336    }
3337  re_string_skip_bytes (regexp, 1);
3338  elem->opr.name[i] = '\0';
3339  switch (token->type)
3340    {
3341    case OP_OPEN_COLL_ELEM:
3342      elem->type = COLL_SYM;
3343      break;
3344    case OP_OPEN_EQUIV_CLASS:
3345      elem->type = EQUIV_CLASS;
3346      break;
3347    case OP_OPEN_CHAR_CLASS:
3348      elem->type = CHAR_CLASS;
3349      break;
3350    default:
3351      break;
3352    }
3353  return REG_NOERROR;
3354}
3355
3356  /* Helper function for parse_bracket_exp.
3357     Build the equivalence class which is represented by NAME.
3358     The result are written to MBCSET and SBCSET.
3359     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3360     is a pointer argument sinse we may update it.  */
3361
3362static reg_errcode_t
3363#ifdef RE_ENABLE_I18N
3364build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3365		   int *equiv_class_alloc, const unsigned char *name)
3366#else /* not RE_ENABLE_I18N */
3367build_equiv_class (bitset_t sbcset, const unsigned char *name)
3368#endif /* not RE_ENABLE_I18N */
3369{
3370#ifdef _LIBC
3371  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3372  if (nrules != 0)
3373    {
3374      const int32_t *table, *indirect;
3375      const unsigned char *weights, *extra, *cp;
3376      unsigned char char_buf[2];
3377      int32_t idx1, idx2;
3378      unsigned int ch;
3379      size_t len;
3380      /* This #include defines a local function!  */
3381# include <locale/weight.h>
3382      /* Calculate the index for equivalence class.  */
3383      cp = name;
3384      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3385      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3386					       _NL_COLLATE_WEIGHTMB);
3387      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3388						   _NL_COLLATE_EXTRAMB);
3389      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3390						_NL_COLLATE_INDIRECTMB);
3391      idx1 = findidx (&cp);
3392      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3393	/* This isn't a valid character.  */
3394	return REG_ECOLLATE;
3395
3396      /* Build single byte matcing table for this equivalence class.  */
3397      char_buf[1] = (unsigned char) '\0';
3398      len = weights[idx1 & 0xffffff];
3399      for (ch = 0; ch < SBC_MAX; ++ch)
3400	{
3401	  char_buf[0] = ch;
3402	  cp = char_buf;
3403	  idx2 = findidx (&cp);
3404/*
3405	  idx2 = table[ch];
3406*/
3407	  if (idx2 == 0)
3408	    /* This isn't a valid character.  */
3409	    continue;
3410	  /* Compare only if the length matches and the collation rule
3411	     index is the same.  */
3412	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3413	    {
3414	      int cnt = 0;
3415
3416	      while (cnt <= len &&
3417		     weights[(idx1 & 0xffffff) + 1 + cnt]
3418		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3419		++cnt;
3420
3421	      if (cnt > len)
3422		bitset_set (sbcset, ch);
3423	    }
3424	}
3425      /* Check whether the array has enough space.  */
3426      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3427	{
3428	  /* Not enough, realloc it.  */
3429	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3430	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3431	  /* Use realloc since the array is NULL if *alloc == 0.  */
3432	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3433						   int32_t,
3434						   new_equiv_class_alloc);
3435	  if (BE (new_equiv_classes == NULL, 0))
3436	    return REG_ESPACE;
3437	  mbcset->equiv_classes = new_equiv_classes;
3438	  *equiv_class_alloc = new_equiv_class_alloc;
3439	}
3440      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3441    }
3442  else
3443#endif /* _LIBC */
3444    {
3445      if (BE (strlen ((const char *) name) != 1, 0))
3446	return REG_ECOLLATE;
3447      bitset_set (sbcset, *name);
3448    }
3449  return REG_NOERROR;
3450}
3451
3452  /* Helper function for parse_bracket_exp.
3453     Build the character class which is represented by NAME.
3454     The result are written to MBCSET and SBCSET.
3455     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3456     is a pointer argument sinse we may update it.  */
3457
3458static reg_errcode_t
3459#ifdef RE_ENABLE_I18N
3460build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3461		 re_charset_t *mbcset, int *char_class_alloc,
3462		 const unsigned char *class_name, reg_syntax_t syntax)
3463#else /* not RE_ENABLE_I18N */
3464build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3465		 const unsigned char *class_name, reg_syntax_t syntax)
3466#endif /* not RE_ENABLE_I18N */
3467{
3468  int i;
3469  const char *name = (const char *) class_name;
3470
3471  /* In case of REG_ICASE "upper" and "lower" match the both of
3472     upper and lower cases.  */
3473  if ((syntax & RE_ICASE)
3474      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3475    name = "alpha";
3476
3477#ifdef RE_ENABLE_I18N
3478  /* Check the space of the arrays.  */
3479  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3480    {
3481      /* Not enough, realloc it.  */
3482      /* +1 in case of mbcset->nchar_classes is 0.  */
3483      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3484      /* Use realloc since array is NULL if *alloc == 0.  */
3485      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3486					       new_char_class_alloc);
3487      if (BE (new_char_classes == NULL, 0))
3488	return REG_ESPACE;
3489      mbcset->char_classes = new_char_classes;
3490      *char_class_alloc = new_char_class_alloc;
3491    }
3492  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3493#endif /* RE_ENABLE_I18N */
3494
3495#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3496  do {						\
3497    if (BE (trans != NULL, 0))			\
3498      {						\
3499	for (i = 0; i < SBC_MAX; ++i)		\
3500  	  if (ctype_func (i))			\
3501	    bitset_set (sbcset, trans[i]);	\
3502      }						\
3503    else					\
3504      {						\
3505	for (i = 0; i < SBC_MAX; ++i)		\
3506  	  if (ctype_func (i))			\
3507	    bitset_set (sbcset, i);		\
3508      }						\
3509  } while (0)
3510
3511  if (strcmp (name, "alnum") == 0)
3512    BUILD_CHARCLASS_LOOP (isalnum);
3513  else if (strcmp (name, "cntrl") == 0)
3514    BUILD_CHARCLASS_LOOP (iscntrl);
3515  else if (strcmp (name, "lower") == 0)
3516    BUILD_CHARCLASS_LOOP (islower);
3517  else if (strcmp (name, "space") == 0)
3518    BUILD_CHARCLASS_LOOP (isspace);
3519  else if (strcmp (name, "alpha") == 0)
3520    BUILD_CHARCLASS_LOOP (isalpha);
3521  else if (strcmp (name, "digit") == 0)
3522    BUILD_CHARCLASS_LOOP (isdigit);
3523  else if (strcmp (name, "print") == 0)
3524    BUILD_CHARCLASS_LOOP (isprint);
3525  else if (strcmp (name, "upper") == 0)
3526    BUILD_CHARCLASS_LOOP (isupper);
3527  else if (strcmp (name, "blank") == 0)
3528    BUILD_CHARCLASS_LOOP (isblank);
3529  else if (strcmp (name, "graph") == 0)
3530    BUILD_CHARCLASS_LOOP (isgraph);
3531  else if (strcmp (name, "punct") == 0)
3532    BUILD_CHARCLASS_LOOP (ispunct);
3533  else if (strcmp (name, "xdigit") == 0)
3534    BUILD_CHARCLASS_LOOP (isxdigit);
3535  else
3536    return REG_ECTYPE;
3537
3538  return REG_NOERROR;
3539}
3540
3541static bin_tree_t *
3542build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3543		    const unsigned char *class_name,
3544		    const unsigned char *extra, int non_match,
3545		    reg_errcode_t *err)
3546{
3547  re_bitset_ptr_t sbcset;
3548#ifdef RE_ENABLE_I18N
3549  re_charset_t *mbcset;
3550  int alloc = 0;
3551#endif /* not RE_ENABLE_I18N */
3552  reg_errcode_t ret;
3553  re_token_t br_token;
3554  bin_tree_t *tree;
3555
3556  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3557#ifdef RE_ENABLE_I18N
3558  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3559#endif /* RE_ENABLE_I18N */
3560
3561#ifdef RE_ENABLE_I18N
3562  if (BE (sbcset == NULL || mbcset == NULL, 0))
3563#else /* not RE_ENABLE_I18N */
3564  if (BE (sbcset == NULL, 0))
3565#endif /* not RE_ENABLE_I18N */
3566    {
3567      *err = REG_ESPACE;
3568      return NULL;
3569    }
3570
3571  if (non_match)
3572    {
3573#ifdef RE_ENABLE_I18N
3574      mbcset->non_match = 1;
3575#endif /* not RE_ENABLE_I18N */
3576    }
3577
3578  /* We don't care the syntax in this case.  */
3579  ret = build_charclass (trans, sbcset,
3580#ifdef RE_ENABLE_I18N
3581			 mbcset, &alloc,
3582#endif /* RE_ENABLE_I18N */
3583			 class_name, 0);
3584
3585  if (BE (ret != REG_NOERROR, 0))
3586    {
3587      re_free (sbcset);
3588#ifdef RE_ENABLE_I18N
3589      free_charset (mbcset);
3590#endif /* RE_ENABLE_I18N */
3591      *err = ret;
3592      return NULL;
3593    }
3594  /* \w match '_' also.  */
3595  for (; *extra; extra++)
3596    bitset_set (sbcset, *extra);
3597
3598  /* If it is non-matching list.  */
3599  if (non_match)
3600    bitset_not (sbcset);
3601
3602#ifdef RE_ENABLE_I18N
3603  /* Ensure only single byte characters are set.  */
3604  if (dfa->mb_cur_max > 1)
3605    bitset_mask (sbcset, dfa->sb_char);
3606#endif
3607
3608  /* Build a tree for simple bracket.  */
3609  br_token.type = SIMPLE_BRACKET;
3610  br_token.opr.sbcset = sbcset;
3611  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3612  if (BE (tree == NULL, 0))
3613    goto build_word_op_espace;
3614
3615#ifdef RE_ENABLE_I18N
3616  if (dfa->mb_cur_max > 1)
3617    {
3618      bin_tree_t *mbc_tree;
3619      /* Build a tree for complex bracket.  */
3620      br_token.type = COMPLEX_BRACKET;
3621      br_token.opr.mbcset = mbcset;
3622      dfa->has_mb_node = 1;
3623      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3624      if (BE (mbc_tree == NULL, 0))
3625	goto build_word_op_espace;
3626      /* Then join them by ALT node.  */
3627      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3628      if (BE (mbc_tree != NULL, 1))
3629	return tree;
3630    }
3631  else
3632    {
3633      free_charset (mbcset);
3634      return tree;
3635    }
3636#else /* not RE_ENABLE_I18N */
3637  return tree;
3638#endif /* not RE_ENABLE_I18N */
3639
3640 build_word_op_espace:
3641  re_free (sbcset);
3642#ifdef RE_ENABLE_I18N
3643  free_charset (mbcset);
3644#endif /* RE_ENABLE_I18N */
3645  *err = REG_ESPACE;
3646  return NULL;
3647}
3648
3649/* This is intended for the expressions like "a{1,3}".
3650   Fetch a number from `input', and return the number.
3651   Return -1, if the number field is empty like "{,1}".
3652   Return -2, If an error is occured.  */
3653
3654static int
3655fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3656{
3657  int num = -1;
3658  unsigned char c;
3659  while (1)
3660    {
3661      fetch_token (token, input, syntax);
3662      c = token->opr.c;
3663      if (BE (token->type == END_OF_RE, 0))
3664	return -2;
3665      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3666	break;
3667      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3668	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3669      num = (num > RE_DUP_MAX) ? -2 : num;
3670    }
3671  return num;
3672}
3673
3674#ifdef RE_ENABLE_I18N
3675static void
3676free_charset (re_charset_t *cset)
3677{
3678  re_free (cset->mbchars);
3679# ifdef _LIBC
3680  re_free (cset->coll_syms);
3681  re_free (cset->equiv_classes);
3682  re_free (cset->range_starts);
3683  re_free (cset->range_ends);
3684# endif
3685  re_free (cset->char_classes);
3686  re_free (cset);
3687}
3688#endif /* RE_ENABLE_I18N */
3689
3690/* Functions for binary tree operation.  */
3691
3692/* Create a tree node.  */
3693
3694static bin_tree_t *
3695create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3696	     re_token_type_t type)
3697{
3698  re_token_t t;
3699  t.type = type;
3700  return create_token_tree (dfa, left, right, &t);
3701}
3702
3703static bin_tree_t *
3704create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3705		   const re_token_t *token)
3706{
3707  bin_tree_t *tree;
3708  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3709    {
3710      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3711
3712      if (storage == NULL)
3713	return NULL;
3714      storage->next = dfa->str_tree_storage;
3715      dfa->str_tree_storage = storage;
3716      dfa->str_tree_storage_idx = 0;
3717    }
3718  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3719
3720  tree->parent = NULL;
3721  tree->left = left;
3722  tree->right = right;
3723  tree->token = *token;
3724  tree->token.duplicated = 0;
3725  tree->token.opt_subexp = 0;
3726  tree->first = NULL;
3727  tree->next = NULL;
3728  tree->node_idx = -1;
3729
3730  if (left != NULL)
3731    left->parent = tree;
3732  if (right != NULL)
3733    right->parent = tree;
3734  return tree;
3735}
3736
3737/* Mark the tree SRC as an optional subexpression.
3738   To be called from preorder or postorder.  */
3739
3740static reg_errcode_t
3741mark_opt_subexp (void *extra, bin_tree_t *node)
3742{
3743  int idx = (int) (long) extra;
3744  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3745    node->token.opt_subexp = 1;
3746
3747  return REG_NOERROR;
3748}
3749
3750/* Free the allocated memory inside NODE. */
3751
3752static void
3753free_token (re_token_t *node)
3754{
3755#ifdef RE_ENABLE_I18N
3756  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3757    free_charset (node->opr.mbcset);
3758  else
3759#endif /* RE_ENABLE_I18N */
3760    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3761      re_free (node->opr.sbcset);
3762}
3763
3764/* Worker function for tree walking.  Free the allocated memory inside NODE
3765   and its children. */
3766
3767static reg_errcode_t
3768free_tree (void *extra, bin_tree_t *node)
3769{
3770  free_token (&node->token);
3771  return REG_NOERROR;
3772}
3773
3774
3775/* Duplicate the node SRC, and return new node.  This is a preorder
3776   visit similar to the one implemented by the generic visitor, but
3777   we need more infrastructure to maintain two parallel trees --- so,
3778   it's easier to duplicate.  */
3779
3780static bin_tree_t *
3781duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3782{
3783  const bin_tree_t *node;
3784  bin_tree_t *dup_root;
3785  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3786
3787  for (node = root; ; )
3788    {
3789      /* Create a new tree and link it back to the current parent.  */
3790      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3791      if (*p_new == NULL)
3792	return NULL;
3793      (*p_new)->parent = dup_node;
3794      (*p_new)->token.duplicated = 1;
3795      dup_node = *p_new;
3796
3797      /* Go to the left node, or up and to the right.  */
3798      if (node->left)
3799	{
3800	  node = node->left;
3801	  p_new = &dup_node->left;
3802	}
3803      else
3804	{
3805	  const bin_tree_t *prev = NULL;
3806	  while (node->right == prev || node->right == NULL)
3807	    {
3808	      prev = node;
3809	      node = node->parent;
3810	      dup_node = dup_node->parent;
3811	      if (!node)
3812	        return dup_root;
3813	    }
3814	  node = node->right;
3815	  p_new = &dup_node->right;
3816	}
3817    }
3818}
3819