regcomp.c revision 250724
1/* Extended regular expression matching and search library.
2   Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, see
18   <http://www.gnu.org/licenses/>.  */
19
20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21					  size_t length, reg_syntax_t syntax);
22static void re_compile_fastmap_iter (regex_t *bufp,
23				     const re_dfastate_t *init_state,
24				     char *fastmap);
25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26#ifdef RE_ENABLE_I18N
27static void free_charset (re_charset_t *cset);
28#endif /* RE_ENABLE_I18N */
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36			       reg_errcode_t (fn (void *, bin_tree_t *)),
37			       void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39				reg_errcode_t (fn (void *, bin_tree_t *)),
40				void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44				 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50				   unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53					 int node, int root);
54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55static int fetch_number (re_string_t *input, re_token_t *token,
56			 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
58			reg_syntax_t syntax) internal_function;
59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60			  reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62				  re_token_t *token, reg_syntax_t syntax,
63				  int nest, reg_errcode_t *err);
64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65				 re_token_t *token, reg_syntax_t syntax,
66				 int nest, reg_errcode_t *err);
67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68				     re_token_t *token, reg_syntax_t syntax,
69				     int nest, reg_errcode_t *err);
70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71				  re_token_t *token, reg_syntax_t syntax,
72				  int nest, reg_errcode_t *err);
73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74				 re_dfa_t *dfa, re_token_t *token,
75				 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77				      re_token_t *token, reg_syntax_t syntax,
78				      reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80					    re_string_t *regexp,
81					    re_token_t *token, int token_len,
82					    re_dfa_t *dfa,
83					    reg_syntax_t syntax,
84					    int accept_hyphen);
85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86					  re_string_t *regexp,
87					  re_token_t *token);
88#ifdef RE_ENABLE_I18N
89static reg_errcode_t build_equiv_class (bitset_t sbcset,
90					re_charset_t *mbcset,
91					int *equiv_class_alloc,
92					const unsigned char *name);
93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94				      bitset_t sbcset,
95				      re_charset_t *mbcset,
96				      int *char_class_alloc,
97				      const unsigned char *class_name,
98				      reg_syntax_t syntax);
99#else  /* not RE_ENABLE_I18N */
100static reg_errcode_t build_equiv_class (bitset_t sbcset,
101					const unsigned char *name);
102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103				      bitset_t sbcset,
104				      const unsigned char *class_name,
105				      reg_syntax_t syntax);
106#endif /* not RE_ENABLE_I18N */
107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108				       RE_TRANSLATE_TYPE trans,
109				       const unsigned char *class_name,
110				       const unsigned char *extra,
111				       int non_match, reg_errcode_t *err);
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113				bin_tree_t *left, bin_tree_t *right,
114				re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116				      bin_tree_t *left, bin_tree_t *right,
117				      const re_token_t *token);
118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122
123/* This table gives an error message for each of the error codes listed
124   in regex.h.  Obviously the order here has to be same as there.
125   POSIX doesn't require that we do anything for REG_NOERROR,
126   but why not be nice?  */
127
128const char __re_error_msgid[] attribute_hidden =
129  {
130#define REG_NOERROR_IDX	0
131    gettext_noop ("Success")	/* REG_NOERROR */
132    "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134    gettext_noop ("No match")	/* REG_NOMATCH */
135    "\0"
136#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138    "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141    "\0"
142#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144    "\0"
145#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147    "\0"
148#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150    "\0"
151#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153    "\0"
154#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156    "\0"
157#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159    "\0"
160#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162    "\0"
163#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164    gettext_noop ("Invalid range end")	/* REG_ERANGE */
165    "\0"
166#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167    gettext_noop ("Memory exhausted") /* REG_ESPACE */
168    "\0"
169#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171    "\0"
172#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173    gettext_noop ("Premature end of regular expression") /* REG_EEND */
174    "\0"
175#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176    gettext_noop ("Regular expression too big") /* REG_ESIZE */
177    "\0"
178#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180  };
181
182const size_t __re_error_msgid_idx[] attribute_hidden =
183  {
184    REG_NOERROR_IDX,
185    REG_NOMATCH_IDX,
186    REG_BADPAT_IDX,
187    REG_ECOLLATE_IDX,
188    REG_ECTYPE_IDX,
189    REG_EESCAPE_IDX,
190    REG_ESUBREG_IDX,
191    REG_EBRACK_IDX,
192    REG_EPAREN_IDX,
193    REG_EBRACE_IDX,
194    REG_BADBR_IDX,
195    REG_ERANGE_IDX,
196    REG_ESPACE_IDX,
197    REG_BADRPT_IDX,
198    REG_EEND_IDX,
199    REG_ESIZE_IDX,
200    REG_ERPAREN_IDX
201  };
202
203/* Entry points for GNU code.  */
204
205/* re_compile_pattern is the GNU regular expression compiler: it
206   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207   Returns 0 if the pattern was valid, otherwise an error string.
208
209   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210   are set in BUFP on entry.  */
211
212const char *
213re_compile_pattern (pattern, length, bufp)
214    const char *pattern;
215    size_t length;
216    struct re_pattern_buffer *bufp;
217{
218  reg_errcode_t ret;
219
220  /* And GNU code determines whether or not to get register information
221     by passing null for the REGS argument to re_match, etc., not by
222     setting no_sub, unless RE_NO_SUB is set.  */
223  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
224
225  /* Match anchors at newline.  */
226  bufp->newline_anchor = 1;
227
228  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
229
230  if (!ret)
231    return NULL;
232  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233}
234#ifdef _LIBC
235weak_alias (__re_compile_pattern, re_compile_pattern)
236#endif
237
238/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
239   also be assigned to arbitrarily: each pattern buffer stores its own
240   syntax, so it can be changed between regex compilations.  */
241/* This has no initializer because initialized variables in Emacs
242   become read-only after dumping.  */
243reg_syntax_t re_syntax_options;
244
245
246/* Specify the precise syntax of regexps for compilation.  This provides
247   for compatibility for various utilities which historically have
248   different, incompatible syntaxes.
249
250   The argument SYNTAX is a bit mask comprised of the various bits
251   defined in regex.h.  We return the old syntax.  */
252
253reg_syntax_t
254re_set_syntax (syntax)
255    reg_syntax_t syntax;
256{
257  reg_syntax_t ret = re_syntax_options;
258
259  re_syntax_options = syntax;
260  return ret;
261}
262#ifdef _LIBC
263weak_alias (__re_set_syntax, re_set_syntax)
264#endif
265
266int
267re_compile_fastmap (bufp)
268    struct re_pattern_buffer *bufp;
269{
270  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
271  char *fastmap = bufp->fastmap;
272
273  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275  if (dfa->init_state != dfa->init_state_word)
276    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277  if (dfa->init_state != dfa->init_state_nl)
278    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279  if (dfa->init_state != dfa->init_state_begbuf)
280    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281  bufp->fastmap_accurate = 1;
282  return 0;
283}
284#ifdef _LIBC
285weak_alias (__re_compile_fastmap, re_compile_fastmap)
286#endif
287
288static inline void
289__attribute ((always_inline))
290re_set_fastmap (char *fastmap, int icase, int ch)
291{
292  fastmap[ch] = 1;
293  if (icase)
294    fastmap[tolower (ch)] = 1;
295}
296
297/* Helper function for re_compile_fastmap.
298   Compile fastmap for the initial_state INIT_STATE.  */
299
300static void
301re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302			 char *fastmap)
303{
304  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
305  int node_cnt;
306  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308    {
309      int node = init_state->nodes.elems[node_cnt];
310      re_token_type_t type = dfa->nodes[node].type;
311
312      if (type == CHARACTER)
313	{
314	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315#ifdef RE_ENABLE_I18N
316	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
317	    {
318	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
319	      wchar_t wc;
320	      mbstate_t state;
321
322	      p = buf;
323	      *p++ = dfa->nodes[node].opr.c;
324	      while (++node < dfa->nodes_len
325		     &&	dfa->nodes[node].type == CHARACTER
326		     && dfa->nodes[node].mb_partial)
327		*p++ = dfa->nodes[node].opr.c;
328	      memset (&state, '\0', sizeof (state));
329	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
330			     &state) == p - buf
331		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
332		      != (size_t) -1))
333		re_set_fastmap (fastmap, 0, buf[0]);
334	    }
335#endif
336	}
337      else if (type == SIMPLE_BRACKET)
338	{
339	  int i, ch;
340	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
341	    {
342	      int j;
343	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
344	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
345		if (w & ((bitset_word_t) 1 << j))
346		  re_set_fastmap (fastmap, icase, ch);
347	    }
348	}
349#ifdef RE_ENABLE_I18N
350      else if (type == COMPLEX_BRACKET)
351	{
352	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
353	  int i;
354
355# ifdef _LIBC
356	  /* See if we have to try all bytes which start multiple collation
357	     elements.
358	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
359		  collation element, and don't catch 'b' since 'b' is
360		  the only collation element which starts from 'b' (and
361		  it is caught by SIMPLE_BRACKET).  */
362	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
363		  && (cset->ncoll_syms || cset->nranges))
364		{
365		  const int32_t *table = (const int32_t *)
366		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
367		  for (i = 0; i < SBC_MAX; ++i)
368		    if (table[i] < 0)
369		      re_set_fastmap (fastmap, icase, i);
370		}
371# endif /* _LIBC */
372
373	  /* See if we have to start the match at all multibyte characters,
374	     i.e. where we would not find an invalid sequence.  This only
375	     applies to multibyte character sets; for single byte character
376	     sets, the SIMPLE_BRACKET again suffices.  */
377	  if (dfa->mb_cur_max > 1
378	      && (cset->nchar_classes || cset->non_match || cset->nranges
379# ifdef _LIBC
380		  || cset->nequiv_classes
381# endif /* _LIBC */
382		 ))
383	    {
384	      unsigned char c = 0;
385	      do
386		{
387		  mbstate_t mbs;
388		  memset (&mbs, 0, sizeof (mbs));
389		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
390		    re_set_fastmap (fastmap, false, (int) c);
391		}
392	      while (++c != 0);
393	    }
394
395	  else
396	    {
397	      /* ... Else catch all bytes which can start the mbchars.  */
398	      for (i = 0; i < cset->nmbchars; ++i)
399		{
400		  char buf[256];
401		  mbstate_t state;
402		  memset (&state, '\0', sizeof (state));
403		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
404		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
405		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
406		    {
407		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
408			  != (size_t) -1)
409			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
410		    }
411		}
412	    }
413	}
414#endif /* RE_ENABLE_I18N */
415      else if (type == OP_PERIOD
416#ifdef RE_ENABLE_I18N
417	       || type == OP_UTF8_PERIOD
418#endif /* RE_ENABLE_I18N */
419	       || type == END_OF_RE)
420	{
421	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
422	  if (type == END_OF_RE)
423	    bufp->can_be_null = 1;
424	  return;
425	}
426    }
427}
428
429/* Entry point for POSIX code.  */
430/* regcomp takes a regular expression as a string and compiles it.
431
432   PREG is a regex_t *.  We do not expect any fields to be initialized,
433   since POSIX says we shouldn't.  Thus, we set
434
435     `buffer' to the compiled pattern;
436     `used' to the length of the compiled pattern;
437     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438       REG_EXTENDED bit in CFLAGS is set; otherwise, to
439       RE_SYNTAX_POSIX_BASIC;
440     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441     `fastmap' to an allocated space for the fastmap;
442     `fastmap_accurate' to zero;
443     `re_nsub' to the number of subexpressions in PATTERN.
444
445   PATTERN is the address of the pattern string.
446
447   CFLAGS is a series of bits which affect compilation.
448
449     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450     use POSIX basic syntax.
451
452     If REG_NEWLINE is set, then . and [^...] don't match newline.
453     Also, regexec will try a match beginning after every newline.
454
455     If REG_ICASE is set, then we considers upper- and lowercase
456     versions of letters to be equivalent when matching.
457
458     If REG_NOSUB is set, then when PREG is passed to regexec, that
459     routine will report only success or failure, and nothing about the
460     registers.
461
462   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
463   the return codes and their meanings.)  */
464
465int
466regcomp (preg, pattern, cflags)
467    regex_t *__restrict preg;
468    const char *__restrict pattern;
469    int cflags;
470{
471  reg_errcode_t ret;
472  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
473			 : RE_SYNTAX_POSIX_BASIC);
474
475  preg->buffer = NULL;
476  preg->allocated = 0;
477  preg->used = 0;
478
479  /* Try to allocate space for the fastmap.  */
480  preg->fastmap = re_malloc (char, SBC_MAX);
481  if (BE (preg->fastmap == NULL, 0))
482    return REG_ESPACE;
483
484  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
485
486  /* If REG_NEWLINE is set, newlines are treated differently.  */
487  if (cflags & REG_NEWLINE)
488    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
489      syntax &= ~RE_DOT_NEWLINE;
490      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
491      /* It also changes the matching behavior.  */
492      preg->newline_anchor = 1;
493    }
494  else
495    preg->newline_anchor = 0;
496  preg->no_sub = !!(cflags & REG_NOSUB);
497  preg->translate = NULL;
498
499  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
500
501  /* POSIX doesn't distinguish between an unmatched open-group and an
502     unmatched close-group: both are REG_EPAREN.  */
503  if (ret == REG_ERPAREN)
504    ret = REG_EPAREN;
505
506  /* We have already checked preg->fastmap != NULL.  */
507  if (BE (ret == REG_NOERROR, 1))
508    /* Compute the fastmap now, since regexec cannot modify the pattern
509       buffer.  This function never fails in this implementation.  */
510    (void) re_compile_fastmap (preg);
511  else
512    {
513      /* Some error occurred while compiling the expression.  */
514      re_free (preg->fastmap);
515      preg->fastmap = NULL;
516    }
517
518  return (int) ret;
519}
520#ifdef _LIBC
521weak_alias (__regcomp, regcomp)
522#endif
523
524/* Returns a message corresponding to an error code, ERRCODE, returned
525   from either regcomp or regexec.   We don't use PREG here.  */
526
527size_t
528regerror (errcode, preg, errbuf, errbuf_size)
529    int errcode;
530    const regex_t *__restrict preg;
531    char *__restrict errbuf;
532    size_t errbuf_size;
533{
534  const char *msg;
535  size_t msg_size;
536
537  if (BE (errcode < 0
538	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
539			       / sizeof (__re_error_msgid_idx[0])), 0))
540    /* Only error codes returned by the rest of the code should be passed
541       to this routine.  If we are given anything else, or if other regex
542       code generates an invalid error code, then the program has a bug.
543       Dump core so we can fix it.  */
544    abort ();
545
546  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
547
548  msg_size = strlen (msg) + 1; /* Includes the null.  */
549
550  if (BE (errbuf_size != 0, 1))
551    {
552      if (BE (msg_size > errbuf_size, 0))
553	{
554#if defined HAVE_MEMPCPY || defined _LIBC
555	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
556#else
557	  memcpy (errbuf, msg, errbuf_size - 1);
558	  errbuf[errbuf_size - 1] = 0;
559#endif
560	}
561      else
562	memcpy (errbuf, msg, msg_size);
563    }
564
565  return msg_size;
566}
567#ifdef _LIBC
568weak_alias (__regerror, regerror)
569#endif
570
571
572#ifdef RE_ENABLE_I18N
573/* This static array is used for the map to single-byte characters when
574   UTF-8 is used.  Otherwise we would allocate memory just to initialize
575   it the same all the time.  UTF-8 is the preferred encoding so this is
576   a worthwhile optimization.  */
577static const bitset_t utf8_sb_map =
578{
579  /* Set the first 128 bits.  */
580  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
581};
582#endif
583
584
585static void
586free_dfa_content (re_dfa_t *dfa)
587{
588  int i, j;
589
590  if (dfa->nodes)
591    for (i = 0; i < dfa->nodes_len; ++i)
592      free_token (dfa->nodes + i);
593  re_free (dfa->nexts);
594  for (i = 0; i < dfa->nodes_len; ++i)
595    {
596      if (dfa->eclosures != NULL)
597	re_node_set_free (dfa->eclosures + i);
598      if (dfa->inveclosures != NULL)
599	re_node_set_free (dfa->inveclosures + i);
600      if (dfa->edests != NULL)
601	re_node_set_free (dfa->edests + i);
602    }
603  re_free (dfa->edests);
604  re_free (dfa->eclosures);
605  re_free (dfa->inveclosures);
606  re_free (dfa->nodes);
607
608  if (dfa->state_table)
609    for (i = 0; i <= dfa->state_hash_mask; ++i)
610      {
611	struct re_state_table_entry *entry = dfa->state_table + i;
612	for (j = 0; j < entry->num; ++j)
613	  {
614	    re_dfastate_t *state = entry->array[j];
615	    free_state (state);
616	  }
617	re_free (entry->array);
618      }
619  re_free (dfa->state_table);
620#ifdef RE_ENABLE_I18N
621  if (dfa->sb_char != utf8_sb_map)
622    re_free (dfa->sb_char);
623#endif
624  re_free (dfa->subexp_map);
625#ifdef DEBUG
626  re_free (dfa->re_str);
627#endif
628
629  re_free (dfa);
630}
631
632
633/* Free dynamically allocated space used by PREG.  */
634
635void
636regfree (preg)
637    regex_t *preg;
638{
639  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
640  if (BE (dfa != NULL, 1))
641    free_dfa_content (dfa);
642  preg->buffer = NULL;
643  preg->allocated = 0;
644
645  re_free (preg->fastmap);
646  preg->fastmap = NULL;
647
648  re_free (preg->translate);
649  preg->translate = NULL;
650}
651#ifdef _LIBC
652weak_alias (__regfree, regfree)
653#endif
654
655/* Entry points compatible with 4.2 BSD regex library.  We don't define
656   them unless specifically requested.  */
657
658#if defined _REGEX_RE_COMP || defined _LIBC
659
660/* BSD has one and only one pattern buffer.  */
661static struct re_pattern_buffer re_comp_buf;
662
663char *
664# ifdef _LIBC
665/* Make these definitions weak in libc, so POSIX programs can redefine
666   these names if they don't use our functions, and still use
667   regcomp/regexec above without link errors.  */
668weak_function
669# endif
670re_comp (s)
671     const char *s;
672{
673  reg_errcode_t ret;
674  char *fastmap;
675
676  if (!s)
677    {
678      if (!re_comp_buf.buffer)
679	return gettext ("No previous regular expression");
680      return 0;
681    }
682
683  if (re_comp_buf.buffer)
684    {
685      fastmap = re_comp_buf.fastmap;
686      re_comp_buf.fastmap = NULL;
687      __regfree (&re_comp_buf);
688      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
689      re_comp_buf.fastmap = fastmap;
690    }
691
692  if (re_comp_buf.fastmap == NULL)
693    {
694      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
695      if (re_comp_buf.fastmap == NULL)
696	return (char *) gettext (__re_error_msgid
697				 + __re_error_msgid_idx[(int) REG_ESPACE]);
698    }
699
700  /* Since `re_exec' always passes NULL for the `regs' argument, we
701     don't need to initialize the pattern buffer fields which affect it.  */
702
703  /* Match anchors at newlines.  */
704  re_comp_buf.newline_anchor = 1;
705
706  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
707
708  if (!ret)
709    return NULL;
710
711  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
712  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
713}
714
715#ifdef _LIBC
716libc_freeres_fn (free_mem)
717{
718  __regfree (&re_comp_buf);
719}
720#endif
721
722#endif /* _REGEX_RE_COMP */
723
724/* Internal entry point.
725   Compile the regular expression PATTERN, whose length is LENGTH.
726   SYNTAX indicate regular expression's syntax.  */
727
728static reg_errcode_t
729re_compile_internal (regex_t *preg, const char * pattern, size_t length,
730		     reg_syntax_t syntax)
731{
732  reg_errcode_t err = REG_NOERROR;
733  re_dfa_t *dfa;
734  re_string_t regexp;
735
736  /* Initialize the pattern buffer.  */
737  preg->fastmap_accurate = 0;
738  preg->syntax = syntax;
739  preg->not_bol = preg->not_eol = 0;
740  preg->used = 0;
741  preg->re_nsub = 0;
742  preg->can_be_null = 0;
743  preg->regs_allocated = REGS_UNALLOCATED;
744
745  /* Initialize the dfa.  */
746  dfa = (re_dfa_t *) preg->buffer;
747  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
748    {
749      /* If zero allocated, but buffer is non-null, try to realloc
750	 enough space.  This loses if buffer's address is bogus, but
751	 that is the user's responsibility.  If ->buffer is NULL this
752	 is a simple allocation.  */
753      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
754      if (dfa == NULL)
755	return REG_ESPACE;
756      preg->allocated = sizeof (re_dfa_t);
757      preg->buffer = (unsigned char *) dfa;
758    }
759  preg->used = sizeof (re_dfa_t);
760
761  err = init_dfa (dfa, length);
762  if (BE (err != REG_NOERROR, 0))
763    {
764      free_dfa_content (dfa);
765      preg->buffer = NULL;
766      preg->allocated = 0;
767      return err;
768    }
769#ifdef DEBUG
770  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
771  dfa->re_str = re_malloc (char, length + 1);
772  strncpy (dfa->re_str, pattern, length + 1);
773#endif
774
775  __libc_lock_init (dfa->lock);
776
777  err = re_string_construct (&regexp, pattern, length, preg->translate,
778			     syntax & RE_ICASE, dfa);
779  if (BE (err != REG_NOERROR, 0))
780    {
781    re_compile_internal_free_return:
782      free_workarea_compile (preg);
783      re_string_destruct (&regexp);
784      free_dfa_content (dfa);
785      preg->buffer = NULL;
786      preg->allocated = 0;
787      return err;
788    }
789
790  /* Parse the regular expression, and build a structure tree.  */
791  preg->re_nsub = 0;
792  dfa->str_tree = parse (&regexp, preg, syntax, &err);
793  if (BE (dfa->str_tree == NULL, 0))
794    goto re_compile_internal_free_return;
795
796  /* Analyze the tree and create the nfa.  */
797  err = analyze (preg);
798  if (BE (err != REG_NOERROR, 0))
799    goto re_compile_internal_free_return;
800
801#ifdef RE_ENABLE_I18N
802  /* If possible, do searching in single byte encoding to speed things up.  */
803  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
804    optimize_utf8 (dfa);
805#endif
806
807  /* Then create the initial state of the dfa.  */
808  err = create_initial_state (dfa);
809
810  /* Release work areas.  */
811  free_workarea_compile (preg);
812  re_string_destruct (&regexp);
813
814  if (BE (err != REG_NOERROR, 0))
815    {
816      free_dfa_content (dfa);
817      preg->buffer = NULL;
818      preg->allocated = 0;
819    }
820
821  return err;
822}
823
824/* Initialize DFA.  We use the length of the regular expression PAT_LEN
825   as the initial length of some arrays.  */
826
827static reg_errcode_t
828init_dfa (re_dfa_t *dfa, size_t pat_len)
829{
830  unsigned int table_size;
831#ifndef _LIBC
832  char *codeset_name;
833#endif
834
835  memset (dfa, '\0', sizeof (re_dfa_t));
836
837  /* Force allocation of str_tree_storage the first time.  */
838  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
839
840  /* Avoid overflows.  */
841  if (pat_len == SIZE_MAX)
842    return REG_ESPACE;
843
844  dfa->nodes_alloc = pat_len + 1;
845  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
846
847  /*  table_size = 2 ^ ceil(log pat_len) */
848  for (table_size = 1; ; table_size <<= 1)
849    if (table_size > pat_len)
850      break;
851
852  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
853  dfa->state_hash_mask = table_size - 1;
854
855  dfa->mb_cur_max = MB_CUR_MAX;
856#ifdef _LIBC
857  if (dfa->mb_cur_max == 6
858      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
859    dfa->is_utf8 = 1;
860  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
861		       != 0);
862#else
863# ifdef HAVE_LANGINFO_CODESET
864  codeset_name = nl_langinfo (CODESET);
865# else
866  codeset_name = getenv ("LC_ALL");
867  if (codeset_name == NULL || codeset_name[0] == '\0')
868    codeset_name = getenv ("LC_CTYPE");
869  if (codeset_name == NULL || codeset_name[0] == '\0')
870    codeset_name = getenv ("LANG");
871  if (codeset_name == NULL)
872    codeset_name = "";
873  else if (strchr (codeset_name, '.') !=  NULL)
874    codeset_name = strchr (codeset_name, '.') + 1;
875# endif
876
877  if (strcasecmp (codeset_name, "UTF-8") == 0
878      || strcasecmp (codeset_name, "UTF8") == 0)
879    dfa->is_utf8 = 1;
880
881  /* We check exhaustively in the loop below if this charset is a
882     superset of ASCII.  */
883  dfa->map_notascii = 0;
884#endif
885
886#ifdef RE_ENABLE_I18N
887  if (dfa->mb_cur_max > 1)
888    {
889      if (dfa->is_utf8)
890	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
891      else
892	{
893	  int i, j, ch;
894
895	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
896	  if (BE (dfa->sb_char == NULL, 0))
897	    return REG_ESPACE;
898
899	  /* Set the bits corresponding to single byte chars.  */
900	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
902	      {
903		wint_t wch = __btowc (ch);
904		if (wch != WEOF)
905		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
906# ifndef _LIBC
907		if (isascii (ch) && wch != ch)
908		  dfa->map_notascii = 1;
909# endif
910	      }
911	}
912    }
913#endif
914
915  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
916    return REG_ESPACE;
917  return REG_NOERROR;
918}
919
920/* Initialize WORD_CHAR table, which indicate which character is
921   "word".  In this case "word" means that it is the word construction
922   character used by some operators like "\<", "\>", etc.  */
923
924static void
925internal_function
926init_word_char (re_dfa_t *dfa)
927{
928  dfa->word_ops_used = 1;
929  int i = 0;
930  int ch = 0;
931  if (BE (dfa->map_notascii == 0, 1))
932    {
933      if (sizeof (dfa->word_char[0]) == 8)
934	{
935          /* The extra temporaries here avoid "implicitly truncated"
936             warnings in the case when this is dead code, i.e. 32-bit.  */
937          const uint64_t wc0 = UINT64_C (0x03ff000000000000);
938          const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
939	  dfa->word_char[0] = wc0;
940	  dfa->word_char[1] = wc1;
941	  i = 2;
942	}
943      else if (sizeof (dfa->word_char[0]) == 4)
944	{
945	  dfa->word_char[0] = UINT32_C (0x00000000);
946	  dfa->word_char[1] = UINT32_C (0x03ff0000);
947	  dfa->word_char[2] = UINT32_C (0x87fffffe);
948	  dfa->word_char[3] = UINT32_C (0x07fffffe);
949	  i = 4;
950	}
951      else
952	abort ();
953      ch = 128;
954
955      if (BE (dfa->is_utf8, 1))
956	{
957	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
958	  return;
959	}
960    }
961
962  for (; i < BITSET_WORDS; ++i)
963    for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
964      if (isalnum (ch) || ch == '_')
965	dfa->word_char[i] |= (bitset_word_t) 1 << j;
966}
967
968/* Free the work area which are only used while compiling.  */
969
970static void
971free_workarea_compile (regex_t *preg)
972{
973  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
974  bin_tree_storage_t *storage, *next;
975  for (storage = dfa->str_tree_storage; storage; storage = next)
976    {
977      next = storage->next;
978      re_free (storage);
979    }
980  dfa->str_tree_storage = NULL;
981  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
982  dfa->str_tree = NULL;
983  re_free (dfa->org_indices);
984  dfa->org_indices = NULL;
985}
986
987/* Create initial states for all contexts.  */
988
989static reg_errcode_t
990create_initial_state (re_dfa_t *dfa)
991{
992  int first, i;
993  reg_errcode_t err;
994  re_node_set init_nodes;
995
996  /* Initial states have the epsilon closure of the node which is
997     the first node of the regular expression.  */
998  first = dfa->str_tree->first->node_idx;
999  dfa->init_node = first;
1000  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1001  if (BE (err != REG_NOERROR, 0))
1002    return err;
1003
1004  /* The back-references which are in initial states can epsilon transit,
1005     since in this case all of the subexpressions can be null.
1006     Then we add epsilon closures of the nodes which are the next nodes of
1007     the back-references.  */
1008  if (dfa->nbackref > 0)
1009    for (i = 0; i < init_nodes.nelem; ++i)
1010      {
1011	int node_idx = init_nodes.elems[i];
1012	re_token_type_t type = dfa->nodes[node_idx].type;
1013
1014	int clexp_idx;
1015	if (type != OP_BACK_REF)
1016	  continue;
1017	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1018	  {
1019	    re_token_t *clexp_node;
1020	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1021	    if (clexp_node->type == OP_CLOSE_SUBEXP
1022		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1023	      break;
1024	  }
1025	if (clexp_idx == init_nodes.nelem)
1026	  continue;
1027
1028	if (type == OP_BACK_REF)
1029	  {
1030	    int dest_idx = dfa->edests[node_idx].elems[0];
1031	    if (!re_node_set_contains (&init_nodes, dest_idx))
1032	      {
1033		reg_errcode_t err = re_node_set_merge (&init_nodes,
1034						       dfa->eclosures
1035						       + dest_idx);
1036		if (err != REG_NOERROR)
1037		  return err;
1038		i = 0;
1039	      }
1040	  }
1041      }
1042
1043  /* It must be the first time to invoke acquire_state.  */
1044  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1045  /* We don't check ERR here, since the initial state must not be NULL.  */
1046  if (BE (dfa->init_state == NULL, 0))
1047    return err;
1048  if (dfa->init_state->has_constraint)
1049    {
1050      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1051						       CONTEXT_WORD);
1052      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1053						     CONTEXT_NEWLINE);
1054      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1055							 &init_nodes,
1056							 CONTEXT_NEWLINE
1057							 | CONTEXT_BEGBUF);
1058      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1059	      || dfa->init_state_begbuf == NULL, 0))
1060	return err;
1061    }
1062  else
1063    dfa->init_state_word = dfa->init_state_nl
1064      = dfa->init_state_begbuf = dfa->init_state;
1065
1066  re_node_set_free (&init_nodes);
1067  return REG_NOERROR;
1068}
1069
1070#ifdef RE_ENABLE_I18N
1071/* If it is possible to do searching in single byte encoding instead of UTF-8
1072   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1073   DFA nodes where needed.  */
1074
1075static void
1076optimize_utf8 (re_dfa_t *dfa)
1077{
1078  int node, i, mb_chars = 0, has_period = 0;
1079
1080  for (node = 0; node < dfa->nodes_len; ++node)
1081    switch (dfa->nodes[node].type)
1082      {
1083      case CHARACTER:
1084	if (dfa->nodes[node].opr.c >= 0x80)
1085	  mb_chars = 1;
1086	break;
1087      case ANCHOR:
1088	switch (dfa->nodes[node].opr.ctx_type)
1089	  {
1090	  case LINE_FIRST:
1091	  case LINE_LAST:
1092	  case BUF_FIRST:
1093	  case BUF_LAST:
1094	    break;
1095	  default:
1096	    /* Word anchors etc. cannot be handled.  It's okay to test
1097	       opr.ctx_type since constraints (for all DFA nodes) are
1098	       created by ORing one or more opr.ctx_type values.  */
1099	    return;
1100	  }
1101	break;
1102      case OP_PERIOD:
1103	has_period = 1;
1104	break;
1105      case OP_BACK_REF:
1106      case OP_ALT:
1107      case END_OF_RE:
1108      case OP_DUP_ASTERISK:
1109      case OP_OPEN_SUBEXP:
1110      case OP_CLOSE_SUBEXP:
1111	break;
1112      case COMPLEX_BRACKET:
1113	return;
1114      case SIMPLE_BRACKET:
1115	/* Just double check.  The non-ASCII range starts at 0x80.  */
1116	assert (0x80 % BITSET_WORD_BITS == 0);
1117	for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1118	  if (dfa->nodes[node].opr.sbcset[i])
1119	    return;
1120	break;
1121      default:
1122	abort ();
1123      }
1124
1125  if (mb_chars || has_period)
1126    for (node = 0; node < dfa->nodes_len; ++node)
1127      {
1128	if (dfa->nodes[node].type == CHARACTER
1129	    && dfa->nodes[node].opr.c >= 0x80)
1130	  dfa->nodes[node].mb_partial = 0;
1131	else if (dfa->nodes[node].type == OP_PERIOD)
1132	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1133      }
1134
1135  /* The search can be in single byte locale.  */
1136  dfa->mb_cur_max = 1;
1137  dfa->is_utf8 = 0;
1138  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1139}
1140#endif
1141
1142/* Analyze the structure tree, and calculate "first", "next", "edest",
1143   "eclosure", and "inveclosure".  */
1144
1145static reg_errcode_t
1146analyze (regex_t *preg)
1147{
1148  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1149  reg_errcode_t ret;
1150
1151  /* Allocate arrays.  */
1152  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1153  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1154  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1155  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1156  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1157	  || dfa->eclosures == NULL, 0))
1158    return REG_ESPACE;
1159
1160  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1161  if (dfa->subexp_map != NULL)
1162    {
1163      int i;
1164      for (i = 0; i < preg->re_nsub; i++)
1165	dfa->subexp_map[i] = i;
1166      preorder (dfa->str_tree, optimize_subexps, dfa);
1167      for (i = 0; i < preg->re_nsub; i++)
1168	if (dfa->subexp_map[i] != i)
1169	  break;
1170      if (i == preg->re_nsub)
1171	{
1172	  free (dfa->subexp_map);
1173	  dfa->subexp_map = NULL;
1174	}
1175    }
1176
1177  ret = postorder (dfa->str_tree, lower_subexps, preg);
1178  if (BE (ret != REG_NOERROR, 0))
1179    return ret;
1180  ret = postorder (dfa->str_tree, calc_first, dfa);
1181  if (BE (ret != REG_NOERROR, 0))
1182    return ret;
1183  preorder (dfa->str_tree, calc_next, dfa);
1184  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1185  if (BE (ret != REG_NOERROR, 0))
1186    return ret;
1187  ret = calc_eclosure (dfa);
1188  if (BE (ret != REG_NOERROR, 0))
1189    return ret;
1190
1191  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1193  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1194      || dfa->nbackref)
1195    {
1196      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1197      if (BE (dfa->inveclosures == NULL, 0))
1198	return REG_ESPACE;
1199      ret = calc_inveclosure (dfa);
1200    }
1201
1202  return ret;
1203}
1204
1205/* Our parse trees are very unbalanced, so we cannot use a stack to
1206   implement parse tree visits.  Instead, we use parent pointers and
1207   some hairy code in these two functions.  */
1208static reg_errcode_t
1209postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1210	   void *extra)
1211{
1212  bin_tree_t *node, *prev;
1213
1214  for (node = root; ; )
1215    {
1216      /* Descend down the tree, preferably to the left (or to the right
1217	 if that's the only child).  */
1218      while (node->left || node->right)
1219	if (node->left)
1220	  node = node->left;
1221	else
1222	  node = node->right;
1223
1224      do
1225	{
1226	  reg_errcode_t err = fn (extra, node);
1227	  if (BE (err != REG_NOERROR, 0))
1228	    return err;
1229	  if (node->parent == NULL)
1230	    return REG_NOERROR;
1231	  prev = node;
1232	  node = node->parent;
1233	}
1234      /* Go up while we have a node that is reached from the right.  */
1235      while (node->right == prev || node->right == NULL);
1236      node = node->right;
1237    }
1238}
1239
1240static reg_errcode_t
1241preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242	  void *extra)
1243{
1244  bin_tree_t *node;
1245
1246  for (node = root; ; )
1247    {
1248      reg_errcode_t err = fn (extra, node);
1249      if (BE (err != REG_NOERROR, 0))
1250	return err;
1251
1252      /* Go to the left node, or up and to the right.  */
1253      if (node->left)
1254	node = node->left;
1255      else
1256	{
1257	  bin_tree_t *prev = NULL;
1258	  while (node->right == prev || node->right == NULL)
1259	    {
1260	      prev = node;
1261	      node = node->parent;
1262	      if (!node)
1263		return REG_NOERROR;
1264	    }
1265	  node = node->right;
1266	}
1267    }
1268}
1269
1270/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1272   backreferences as well.  Requires a preorder visit.  */
1273static reg_errcode_t
1274optimize_subexps (void *extra, bin_tree_t *node)
1275{
1276  re_dfa_t *dfa = (re_dfa_t *) extra;
1277
1278  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1279    {
1280      int idx = node->token.opr.idx;
1281      node->token.opr.idx = dfa->subexp_map[idx];
1282      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1283    }
1284
1285  else if (node->token.type == SUBEXP
1286	   && node->left && node->left->token.type == SUBEXP)
1287    {
1288      int other_idx = node->left->token.opr.idx;
1289
1290      node->left = node->left->left;
1291      if (node->left)
1292	node->left->parent = node;
1293
1294      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295      if (other_idx < BITSET_WORD_BITS)
1296	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1297    }
1298
1299  return REG_NOERROR;
1300}
1301
1302/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1304static reg_errcode_t
1305lower_subexps (void *extra, bin_tree_t *node)
1306{
1307  regex_t *preg = (regex_t *) extra;
1308  reg_errcode_t err = REG_NOERROR;
1309
1310  if (node->left && node->left->token.type == SUBEXP)
1311    {
1312      node->left = lower_subexp (&err, preg, node->left);
1313      if (node->left)
1314	node->left->parent = node;
1315    }
1316  if (node->right && node->right->token.type == SUBEXP)
1317    {
1318      node->right = lower_subexp (&err, preg, node->right);
1319      if (node->right)
1320	node->right->parent = node;
1321    }
1322
1323  return err;
1324}
1325
1326static bin_tree_t *
1327lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1328{
1329  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1330  bin_tree_t *body = node->left;
1331  bin_tree_t *op, *cls, *tree1, *tree;
1332
1333  if (preg->no_sub
1334      /* We do not optimize empty subexpressions, because otherwise we may
1335	 have bad CONCAT nodes with NULL children.  This is obviously not
1336	 very common, so we do not lose much.  An example that triggers
1337	 this case is the sed "script" /\(\)/x.  */
1338      && node->left != NULL
1339      && (node->token.opr.idx >= BITSET_WORD_BITS
1340	  || !(dfa->used_bkref_map
1341	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1342    return node->left;
1343
1344  /* Convert the SUBEXP node to the concatenation of an
1345     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1346  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349  tree = create_tree (dfa, op, tree1, CONCAT);
1350  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1351    {
1352      *err = REG_ESPACE;
1353      return NULL;
1354    }
1355
1356  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358  return tree;
1359}
1360
1361/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362   nodes.  Requires a postorder visit.  */
1363static reg_errcode_t
1364calc_first (void *extra, bin_tree_t *node)
1365{
1366  re_dfa_t *dfa = (re_dfa_t *) extra;
1367  if (node->token.type == CONCAT)
1368    {
1369      node->first = node->left->first;
1370      node->node_idx = node->left->node_idx;
1371    }
1372  else
1373    {
1374      node->first = node;
1375      node->node_idx = re_dfa_add_node (dfa, node->token);
1376      if (BE (node->node_idx == -1, 0))
1377	return REG_ESPACE;
1378      if (node->token.type == ANCHOR)
1379	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1380    }
1381  return REG_NOERROR;
1382}
1383
1384/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1385static reg_errcode_t
1386calc_next (void *extra, bin_tree_t *node)
1387{
1388  switch (node->token.type)
1389    {
1390    case OP_DUP_ASTERISK:
1391      node->left->next = node;
1392      break;
1393    case CONCAT:
1394      node->left->next = node->right->first;
1395      node->right->next = node->next;
1396      break;
1397    default:
1398      if (node->left)
1399	node->left->next = node->next;
1400      if (node->right)
1401	node->right->next = node->next;
1402      break;
1403    }
1404  return REG_NOERROR;
1405}
1406
1407/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1408static reg_errcode_t
1409link_nfa_nodes (void *extra, bin_tree_t *node)
1410{
1411  re_dfa_t *dfa = (re_dfa_t *) extra;
1412  int idx = node->node_idx;
1413  reg_errcode_t err = REG_NOERROR;
1414
1415  switch (node->token.type)
1416    {
1417    case CONCAT:
1418      break;
1419
1420    case END_OF_RE:
1421      assert (node->next == NULL);
1422      break;
1423
1424    case OP_DUP_ASTERISK:
1425    case OP_ALT:
1426      {
1427	int left, right;
1428	dfa->has_plural_match = 1;
1429	if (node->left != NULL)
1430	  left = node->left->first->node_idx;
1431	else
1432	  left = node->next->node_idx;
1433	if (node->right != NULL)
1434	  right = node->right->first->node_idx;
1435	else
1436	  right = node->next->node_idx;
1437	assert (left > -1);
1438	assert (right > -1);
1439	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1440      }
1441      break;
1442
1443    case ANCHOR:
1444    case OP_OPEN_SUBEXP:
1445    case OP_CLOSE_SUBEXP:
1446      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1447      break;
1448
1449    case OP_BACK_REF:
1450      dfa->nexts[idx] = node->next->node_idx;
1451      if (node->token.type == OP_BACK_REF)
1452	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1453      break;
1454
1455    default:
1456      assert (!IS_EPSILON_NODE (node->token.type));
1457      dfa->nexts[idx] = node->next->node_idx;
1458      break;
1459    }
1460
1461  return err;
1462}
1463
1464/* Duplicate the epsilon closure of the node ROOT_NODE.
1465   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466   to their own constraint.  */
1467
1468static reg_errcode_t
1469internal_function
1470duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1471			int root_node, unsigned int init_constraint)
1472{
1473  int org_node, clone_node, ret;
1474  unsigned int constraint = init_constraint;
1475  for (org_node = top_org_node, clone_node = top_clone_node;;)
1476    {
1477      int org_dest, clone_dest;
1478      if (dfa->nodes[org_node].type == OP_BACK_REF)
1479	{
1480	  /* If the back reference epsilon-transit, its destination must
1481	     also have the constraint.  Then duplicate the epsilon closure
1482	     of the destination of the back reference, and store it in
1483	     edests of the back reference.  */
1484	  org_dest = dfa->nexts[org_node];
1485	  re_node_set_empty (dfa->edests + clone_node);
1486	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1487	  if (BE (clone_dest == -1, 0))
1488	    return REG_ESPACE;
1489	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1490	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491	  if (BE (ret < 0, 0))
1492	    return REG_ESPACE;
1493	}
1494      else if (dfa->edests[org_node].nelem == 0)
1495	{
1496	  /* In case of the node can't epsilon-transit, don't duplicate the
1497	     destination and store the original destination as the
1498	     destination of the node.  */
1499	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1500	  break;
1501	}
1502      else if (dfa->edests[org_node].nelem == 1)
1503	{
1504	  /* In case of the node can epsilon-transit, and it has only one
1505	     destination.  */
1506	  org_dest = dfa->edests[org_node].elems[0];
1507	  re_node_set_empty (dfa->edests + clone_node);
1508	  /* If the node is root_node itself, it means the epsilon clsoure
1509	     has a loop.   Then tie it to the destination of the root_node.  */
1510	  if (org_node == root_node && clone_node != org_node)
1511	    {
1512	      ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1513	      if (BE (ret < 0, 0))
1514		return REG_ESPACE;
1515	      break;
1516	    }
1517	  /* In case of the node has another constraint, add it.  */
1518	  constraint |= dfa->nodes[org_node].constraint;
1519	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1520	  if (BE (clone_dest == -1, 0))
1521	    return REG_ESPACE;
1522	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1523	  if (BE (ret < 0, 0))
1524	    return REG_ESPACE;
1525	}
1526      else /* dfa->edests[org_node].nelem == 2 */
1527	{
1528	  /* In case of the node can epsilon-transit, and it has two
1529	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1530	  org_dest = dfa->edests[org_node].elems[0];
1531	  re_node_set_empty (dfa->edests + clone_node);
1532	  /* Search for a duplicated node which satisfies the constraint.  */
1533	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1534	  if (clone_dest == -1)
1535	    {
1536	      /* There is no such duplicated node, create a new one.  */
1537	      reg_errcode_t err;
1538	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1539	      if (BE (clone_dest == -1, 0))
1540		return REG_ESPACE;
1541	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542	      if (BE (ret < 0, 0))
1543		return REG_ESPACE;
1544	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545					    root_node, constraint);
1546	      if (BE (err != REG_NOERROR, 0))
1547		return err;
1548	    }
1549	  else
1550	    {
1551	      /* There is a duplicated node which satisfies the constraint,
1552		 use it to avoid infinite loop.  */
1553	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554	      if (BE (ret < 0, 0))
1555		return REG_ESPACE;
1556	    }
1557
1558	  org_dest = dfa->edests[org_node].elems[1];
1559	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1560	  if (BE (clone_dest == -1, 0))
1561	    return REG_ESPACE;
1562	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563	  if (BE (ret < 0, 0))
1564	    return REG_ESPACE;
1565	}
1566      org_node = org_dest;
1567      clone_node = clone_dest;
1568    }
1569  return REG_NOERROR;
1570}
1571
1572/* Search for a node which is duplicated from the node ORG_NODE, and
1573   satisfies the constraint CONSTRAINT.  */
1574
1575static int
1576search_duplicated_node (const re_dfa_t *dfa, int org_node,
1577			unsigned int constraint)
1578{
1579  int idx;
1580  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1581    {
1582      if (org_node == dfa->org_indices[idx]
1583	  && constraint == dfa->nodes[idx].constraint)
1584	return idx; /* Found.  */
1585    }
1586  return -1; /* Not found.  */
1587}
1588
1589/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1590   Return the index of the new node, or -1 if insufficient storage is
1591   available.  */
1592
1593static int
1594duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1595{
1596  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597  if (BE (dup_idx != -1, 1))
1598    {
1599      dfa->nodes[dup_idx].constraint = constraint;
1600      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1601      dfa->nodes[dup_idx].duplicated = 1;
1602
1603      /* Store the index of the original node.  */
1604      dfa->org_indices[dup_idx] = org_idx;
1605    }
1606  return dup_idx;
1607}
1608
1609static reg_errcode_t
1610calc_inveclosure (re_dfa_t *dfa)
1611{
1612  int src, idx, ret;
1613  for (idx = 0; idx < dfa->nodes_len; ++idx)
1614    re_node_set_init_empty (dfa->inveclosures + idx);
1615
1616  for (src = 0; src < dfa->nodes_len; ++src)
1617    {
1618      int *elems = dfa->eclosures[src].elems;
1619      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1620	{
1621	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1622	  if (BE (ret == -1, 0))
1623	    return REG_ESPACE;
1624	}
1625    }
1626
1627  return REG_NOERROR;
1628}
1629
1630/* Calculate "eclosure" for all the node in DFA.  */
1631
1632static reg_errcode_t
1633calc_eclosure (re_dfa_t *dfa)
1634{
1635  int node_idx, incomplete;
1636#ifdef DEBUG
1637  assert (dfa->nodes_len > 0);
1638#endif
1639  incomplete = 0;
1640  /* For each nodes, calculate epsilon closure.  */
1641  for (node_idx = 0; ; ++node_idx)
1642    {
1643      reg_errcode_t err;
1644      re_node_set eclosure_elem;
1645      if (node_idx == dfa->nodes_len)
1646	{
1647	  if (!incomplete)
1648	    break;
1649	  incomplete = 0;
1650	  node_idx = 0;
1651	}
1652
1653#ifdef DEBUG
1654      assert (dfa->eclosures[node_idx].nelem != -1);
1655#endif
1656
1657      /* If we have already calculated, skip it.  */
1658      if (dfa->eclosures[node_idx].nelem != 0)
1659	continue;
1660      /* Calculate epsilon closure of `node_idx'.  */
1661      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1662      if (BE (err != REG_NOERROR, 0))
1663	return err;
1664
1665      if (dfa->eclosures[node_idx].nelem == 0)
1666	{
1667	  incomplete = 1;
1668	  re_node_set_free (&eclosure_elem);
1669	}
1670    }
1671  return REG_NOERROR;
1672}
1673
1674/* Calculate epsilon closure of NODE.  */
1675
1676static reg_errcode_t
1677calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1678{
1679  reg_errcode_t err;
1680  int i;
1681  re_node_set eclosure;
1682  int ret;
1683  int incomplete = 0;
1684  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1685  if (BE (err != REG_NOERROR, 0))
1686    return err;
1687
1688  /* This indicates that we are calculating this node now.
1689     We reference this value to avoid infinite loop.  */
1690  dfa->eclosures[node].nelem = -1;
1691
1692  /* If the current node has constraints, duplicate all nodes
1693     since they must inherit the constraints.  */
1694  if (dfa->nodes[node].constraint
1695      && dfa->edests[node].nelem
1696      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1697    {
1698      err = duplicate_node_closure (dfa, node, node, node,
1699				    dfa->nodes[node].constraint);
1700      if (BE (err != REG_NOERROR, 0))
1701	return err;
1702    }
1703
1704  /* Expand each epsilon destination nodes.  */
1705  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1706    for (i = 0; i < dfa->edests[node].nelem; ++i)
1707      {
1708	re_node_set eclosure_elem;
1709	int edest = dfa->edests[node].elems[i];
1710	/* If calculating the epsilon closure of `edest' is in progress,
1711	   return intermediate result.  */
1712	if (dfa->eclosures[edest].nelem == -1)
1713	  {
1714	    incomplete = 1;
1715	    continue;
1716	  }
1717	/* If we haven't calculated the epsilon closure of `edest' yet,
1718	   calculate now. Otherwise use calculated epsilon closure.  */
1719	if (dfa->eclosures[edest].nelem == 0)
1720	  {
1721	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1722	    if (BE (err != REG_NOERROR, 0))
1723	      return err;
1724	  }
1725	else
1726	  eclosure_elem = dfa->eclosures[edest];
1727	/* Merge the epsilon closure of `edest'.  */
1728	err = re_node_set_merge (&eclosure, &eclosure_elem);
1729	if (BE (err != REG_NOERROR, 0))
1730	  return err;
1731	/* If the epsilon closure of `edest' is incomplete,
1732	   the epsilon closure of this node is also incomplete.  */
1733	if (dfa->eclosures[edest].nelem == 0)
1734	  {
1735	    incomplete = 1;
1736	    re_node_set_free (&eclosure_elem);
1737	  }
1738      }
1739
1740  /* An epsilon closure includes itself.  */
1741  ret = re_node_set_insert (&eclosure, node);
1742  if (BE (ret < 0, 0))
1743    return REG_ESPACE;
1744  if (incomplete && !root)
1745    dfa->eclosures[node].nelem = 0;
1746  else
1747    dfa->eclosures[node] = eclosure;
1748  *new_set = eclosure;
1749  return REG_NOERROR;
1750}
1751
1752/* Functions for token which are used in the parser.  */
1753
1754/* Fetch a token from INPUT.
1755   We must not use this function inside bracket expressions.  */
1756
1757static void
1758internal_function
1759fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1760{
1761  re_string_skip_bytes (input, peek_token (result, input, syntax));
1762}
1763
1764/* Peek a token from INPUT, and return the length of the token.
1765   We must not use this function inside bracket expressions.  */
1766
1767static int
1768internal_function
1769peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1770{
1771  unsigned char c;
1772
1773  if (re_string_eoi (input))
1774    {
1775      token->type = END_OF_RE;
1776      return 0;
1777    }
1778
1779  c = re_string_peek_byte (input, 0);
1780  token->opr.c = c;
1781
1782  token->word_char = 0;
1783#ifdef RE_ENABLE_I18N
1784  token->mb_partial = 0;
1785  if (input->mb_cur_max > 1 &&
1786      !re_string_first_byte (input, re_string_cur_idx (input)))
1787    {
1788      token->type = CHARACTER;
1789      token->mb_partial = 1;
1790      return 1;
1791    }
1792#endif
1793  if (c == '\\')
1794    {
1795      unsigned char c2;
1796      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1797	{
1798	  token->type = BACK_SLASH;
1799	  return 1;
1800	}
1801
1802      c2 = re_string_peek_byte_case (input, 1);
1803      token->opr.c = c2;
1804      token->type = CHARACTER;
1805#ifdef RE_ENABLE_I18N
1806      if (input->mb_cur_max > 1)
1807	{
1808	  wint_t wc = re_string_wchar_at (input,
1809					  re_string_cur_idx (input) + 1);
1810	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1811	}
1812      else
1813#endif
1814	token->word_char = IS_WORD_CHAR (c2) != 0;
1815
1816      switch (c2)
1817	{
1818	case '|':
1819	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1820	    token->type = OP_ALT;
1821	  break;
1822	case '1': case '2': case '3': case '4': case '5':
1823	case '6': case '7': case '8': case '9':
1824	  if (!(syntax & RE_NO_BK_REFS))
1825	    {
1826	      token->type = OP_BACK_REF;
1827	      token->opr.idx = c2 - '1';
1828	    }
1829	  break;
1830	case '<':
1831	  if (!(syntax & RE_NO_GNU_OPS))
1832	    {
1833	      token->type = ANCHOR;
1834	      token->opr.ctx_type = WORD_FIRST;
1835	    }
1836	  break;
1837	case '>':
1838	  if (!(syntax & RE_NO_GNU_OPS))
1839	    {
1840	      token->type = ANCHOR;
1841	      token->opr.ctx_type = WORD_LAST;
1842	    }
1843	  break;
1844	case 'b':
1845	  if (!(syntax & RE_NO_GNU_OPS))
1846	    {
1847	      token->type = ANCHOR;
1848	      token->opr.ctx_type = WORD_DELIM;
1849	    }
1850	  break;
1851	case 'B':
1852	  if (!(syntax & RE_NO_GNU_OPS))
1853	    {
1854	      token->type = ANCHOR;
1855	      token->opr.ctx_type = NOT_WORD_DELIM;
1856	    }
1857	  break;
1858	case 'w':
1859	  if (!(syntax & RE_NO_GNU_OPS))
1860	    token->type = OP_WORD;
1861	  break;
1862	case 'W':
1863	  if (!(syntax & RE_NO_GNU_OPS))
1864	    token->type = OP_NOTWORD;
1865	  break;
1866	case 's':
1867	  if (!(syntax & RE_NO_GNU_OPS))
1868	    token->type = OP_SPACE;
1869	  break;
1870	case 'S':
1871	  if (!(syntax & RE_NO_GNU_OPS))
1872	    token->type = OP_NOTSPACE;
1873	  break;
1874	case '`':
1875	  if (!(syntax & RE_NO_GNU_OPS))
1876	    {
1877	      token->type = ANCHOR;
1878	      token->opr.ctx_type = BUF_FIRST;
1879	    }
1880	  break;
1881	case '\'':
1882	  if (!(syntax & RE_NO_GNU_OPS))
1883	    {
1884	      token->type = ANCHOR;
1885	      token->opr.ctx_type = BUF_LAST;
1886	    }
1887	  break;
1888	case '(':
1889	  if (!(syntax & RE_NO_BK_PARENS))
1890	    token->type = OP_OPEN_SUBEXP;
1891	  break;
1892	case ')':
1893	  if (!(syntax & RE_NO_BK_PARENS))
1894	    token->type = OP_CLOSE_SUBEXP;
1895	  break;
1896	case '+':
1897	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898	    token->type = OP_DUP_PLUS;
1899	  break;
1900	case '?':
1901	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902	    token->type = OP_DUP_QUESTION;
1903	  break;
1904	case '{':
1905	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906	    token->type = OP_OPEN_DUP_NUM;
1907	  break;
1908	case '}':
1909	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910	    token->type = OP_CLOSE_DUP_NUM;
1911	  break;
1912	default:
1913	  break;
1914	}
1915      return 2;
1916    }
1917
1918  token->type = CHARACTER;
1919#ifdef RE_ENABLE_I18N
1920  if (input->mb_cur_max > 1)
1921    {
1922      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1923      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1924    }
1925  else
1926#endif
1927    token->word_char = IS_WORD_CHAR (token->opr.c);
1928
1929  switch (c)
1930    {
1931    case '\n':
1932      if (syntax & RE_NEWLINE_ALT)
1933	token->type = OP_ALT;
1934      break;
1935    case '|':
1936      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1937	token->type = OP_ALT;
1938      break;
1939    case '*':
1940      token->type = OP_DUP_ASTERISK;
1941      break;
1942    case '+':
1943      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944	token->type = OP_DUP_PLUS;
1945      break;
1946    case '?':
1947      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948	token->type = OP_DUP_QUESTION;
1949      break;
1950    case '{':
1951      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952	token->type = OP_OPEN_DUP_NUM;
1953      break;
1954    case '}':
1955      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956	token->type = OP_CLOSE_DUP_NUM;
1957      break;
1958    case '(':
1959      if (syntax & RE_NO_BK_PARENS)
1960	token->type = OP_OPEN_SUBEXP;
1961      break;
1962    case ')':
1963      if (syntax & RE_NO_BK_PARENS)
1964	token->type = OP_CLOSE_SUBEXP;
1965      break;
1966    case '[':
1967      token->type = OP_OPEN_BRACKET;
1968      break;
1969    case '.':
1970      token->type = OP_PERIOD;
1971      break;
1972    case '^':
1973      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1974	  re_string_cur_idx (input) != 0)
1975	{
1976	  char prev = re_string_peek_byte (input, -1);
1977	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1978	    break;
1979	}
1980      token->type = ANCHOR;
1981      token->opr.ctx_type = LINE_FIRST;
1982      break;
1983    case '$':
1984      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1985	  re_string_cur_idx (input) + 1 != re_string_length (input))
1986	{
1987	  re_token_t next;
1988	  re_string_skip_bytes (input, 1);
1989	  peek_token (&next, input, syntax);
1990	  re_string_skip_bytes (input, -1);
1991	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1992	    break;
1993	}
1994      token->type = ANCHOR;
1995      token->opr.ctx_type = LINE_LAST;
1996      break;
1997    default:
1998      break;
1999    }
2000  return 1;
2001}
2002
2003/* Peek a token from INPUT, and return the length of the token.
2004   We must not use this function out of bracket expressions.  */
2005
2006static int
2007internal_function
2008peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2009{
2010  unsigned char c;
2011  if (re_string_eoi (input))
2012    {
2013      token->type = END_OF_RE;
2014      return 0;
2015    }
2016  c = re_string_peek_byte (input, 0);
2017  token->opr.c = c;
2018
2019#ifdef RE_ENABLE_I18N
2020  if (input->mb_cur_max > 1 &&
2021      !re_string_first_byte (input, re_string_cur_idx (input)))
2022    {
2023      token->type = CHARACTER;
2024      return 1;
2025    }
2026#endif /* RE_ENABLE_I18N */
2027
2028  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2029      && re_string_cur_idx (input) + 1 < re_string_length (input))
2030    {
2031      /* In this case, '\' escape a character.  */
2032      unsigned char c2;
2033      re_string_skip_bytes (input, 1);
2034      c2 = re_string_peek_byte (input, 0);
2035      token->opr.c = c2;
2036      token->type = CHARACTER;
2037      return 1;
2038    }
2039  if (c == '[') /* '[' is a special char in a bracket exps.  */
2040    {
2041      unsigned char c2;
2042      int token_len;
2043      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2044	c2 = re_string_peek_byte (input, 1);
2045      else
2046	c2 = 0;
2047      token->opr.c = c2;
2048      token_len = 2;
2049      switch (c2)
2050	{
2051	case '.':
2052	  token->type = OP_OPEN_COLL_ELEM;
2053	  break;
2054	case '=':
2055	  token->type = OP_OPEN_EQUIV_CLASS;
2056	  break;
2057	case ':':
2058	  if (syntax & RE_CHAR_CLASSES)
2059	    {
2060	      token->type = OP_OPEN_CHAR_CLASS;
2061	      break;
2062	    }
2063	  /* else fall through.  */
2064	default:
2065	  token->type = CHARACTER;
2066	  token->opr.c = c;
2067	  token_len = 1;
2068	  break;
2069	}
2070      return token_len;
2071    }
2072  switch (c)
2073    {
2074    case '-':
2075      token->type = OP_CHARSET_RANGE;
2076      break;
2077    case ']':
2078      token->type = OP_CLOSE_BRACKET;
2079      break;
2080    case '^':
2081      token->type = OP_NON_MATCH_LIST;
2082      break;
2083    default:
2084      token->type = CHARACTER;
2085    }
2086  return 1;
2087}
2088
2089/* Functions for parser.  */
2090
2091/* Entry point of the parser.
2092   Parse the regular expression REGEXP and return the structure tree.
2093   If an error is occured, ERR is set by error code, and return NULL.
2094   This function build the following tree, from regular expression <reg_exp>:
2095	   CAT
2096	   / \
2097	  /   \
2098   <reg_exp>  EOR
2099
2100   CAT means concatenation.
2101   EOR means end of regular expression.  */
2102
2103static bin_tree_t *
2104parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2105       reg_errcode_t *err)
2106{
2107  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2108  bin_tree_t *tree, *eor, *root;
2109  re_token_t current_token;
2110  dfa->syntax = syntax;
2111  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2112  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2113  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114    return NULL;
2115  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2116  if (tree != NULL)
2117    root = create_tree (dfa, tree, eor, CONCAT);
2118  else
2119    root = eor;
2120  if (BE (eor == NULL || root == NULL, 0))
2121    {
2122      *err = REG_ESPACE;
2123      return NULL;
2124    }
2125  return root;
2126}
2127
2128/* This function build the following tree, from regular expression
2129   <branch1>|<branch2>:
2130	   ALT
2131	   / \
2132	  /   \
2133   <branch1> <branch2>
2134
2135   ALT means alternative, which represents the operator `|'.  */
2136
2137static bin_tree_t *
2138parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2139	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2140{
2141  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2142  bin_tree_t *tree, *branch = NULL;
2143  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2144  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2145    return NULL;
2146
2147  while (token->type == OP_ALT)
2148    {
2149      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2150      if (token->type != OP_ALT && token->type != END_OF_RE
2151	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2152	{
2153	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2154	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2155	    return NULL;
2156	}
2157      else
2158	branch = NULL;
2159      tree = create_tree (dfa, tree, branch, OP_ALT);
2160      if (BE (tree == NULL, 0))
2161	{
2162	  *err = REG_ESPACE;
2163	  return NULL;
2164	}
2165    }
2166  return tree;
2167}
2168
2169/* This function build the following tree, from regular expression
2170   <exp1><exp2>:
2171	CAT
2172	/ \
2173       /   \
2174   <exp1> <exp2>
2175
2176   CAT means concatenation.  */
2177
2178static bin_tree_t *
2179parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2180	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2181{
2182  bin_tree_t *tree, *exp;
2183  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2185  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186    return NULL;
2187
2188  while (token->type != OP_ALT && token->type != END_OF_RE
2189	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2190    {
2191      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2192      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2193	{
2194	  if (tree != NULL)
2195	    postorder (tree, free_tree, NULL);
2196	  return NULL;
2197	}
2198      if (tree != NULL && exp != NULL)
2199	{
2200	  bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2201	  if (newtree == NULL)
2202	    {
2203	      postorder (exp, free_tree, NULL);
2204	      postorder (tree, free_tree, NULL);
2205	      *err = REG_ESPACE;
2206	      return NULL;
2207	    }
2208	  tree = newtree;
2209	}
2210      else if (tree == NULL)
2211	tree = exp;
2212      /* Otherwise exp == NULL, we don't need to create new tree.  */
2213    }
2214  return tree;
2215}
2216
2217/* This function build the following tree, from regular expression a*:
2218	 *
2219	 |
2220	 a
2221*/
2222
2223static bin_tree_t *
2224parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2226{
2227  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228  bin_tree_t *tree;
2229  switch (token->type)
2230    {
2231    case CHARACTER:
2232      tree = create_token_tree (dfa, NULL, NULL, token);
2233      if (BE (tree == NULL, 0))
2234	{
2235	  *err = REG_ESPACE;
2236	  return NULL;
2237	}
2238#ifdef RE_ENABLE_I18N
2239      if (dfa->mb_cur_max > 1)
2240	{
2241	  while (!re_string_eoi (regexp)
2242		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243	    {
2244	      bin_tree_t *mbc_remain;
2245	      fetch_token (token, regexp, syntax);
2246	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2249		{
2250		  *err = REG_ESPACE;
2251		  return NULL;
2252		}
2253	    }
2254	}
2255#endif
2256      break;
2257    case OP_OPEN_SUBEXP:
2258      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260	return NULL;
2261      break;
2262    case OP_OPEN_BRACKET:
2263      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265	return NULL;
2266      break;
2267    case OP_BACK_REF:
2268      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269	{
2270	  *err = REG_ESUBREG;
2271	  return NULL;
2272	}
2273      dfa->used_bkref_map |= 1 << token->opr.idx;
2274      tree = create_token_tree (dfa, NULL, NULL, token);
2275      if (BE (tree == NULL, 0))
2276	{
2277	  *err = REG_ESPACE;
2278	  return NULL;
2279	}
2280      ++dfa->nbackref;
2281      dfa->has_mb_node = 1;
2282      break;
2283    case OP_OPEN_DUP_NUM:
2284      if (syntax & RE_CONTEXT_INVALID_DUP)
2285	{
2286	  *err = REG_BADRPT;
2287	  return NULL;
2288	}
2289      /* FALLTHROUGH */
2290    case OP_DUP_ASTERISK:
2291    case OP_DUP_PLUS:
2292    case OP_DUP_QUESTION:
2293      if (syntax & RE_CONTEXT_INVALID_OPS)
2294	{
2295	  *err = REG_BADRPT;
2296	  return NULL;
2297	}
2298      else if (syntax & RE_CONTEXT_INDEP_OPS)
2299	{
2300	  fetch_token (token, regexp, syntax);
2301	  return parse_expression (regexp, preg, token, syntax, nest, err);
2302	}
2303      /* else fall through  */
2304    case OP_CLOSE_SUBEXP:
2305      if ((token->type == OP_CLOSE_SUBEXP) &&
2306	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307	{
2308	  *err = REG_ERPAREN;
2309	  return NULL;
2310	}
2311      /* else fall through  */
2312    case OP_CLOSE_DUP_NUM:
2313      /* We treat it as a normal character.  */
2314
2315      /* Then we can these characters as normal characters.  */
2316      token->type = CHARACTER;
2317      /* mb_partial and word_char bits should be initialized already
2318	 by peek_token.  */
2319      tree = create_token_tree (dfa, NULL, NULL, token);
2320      if (BE (tree == NULL, 0))
2321	{
2322	  *err = REG_ESPACE;
2323	  return NULL;
2324	}
2325      break;
2326    case ANCHOR:
2327      if ((token->opr.ctx_type
2328	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329	  && dfa->word_ops_used == 0)
2330	init_word_char (dfa);
2331      if (token->opr.ctx_type == WORD_DELIM
2332	  || token->opr.ctx_type == NOT_WORD_DELIM)
2333	{
2334	  bin_tree_t *tree_first, *tree_last;
2335	  if (token->opr.ctx_type == WORD_DELIM)
2336	    {
2337	      token->opr.ctx_type = WORD_FIRST;
2338	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2339	      token->opr.ctx_type = WORD_LAST;
2340	    }
2341	  else
2342	    {
2343	      token->opr.ctx_type = INSIDE_WORD;
2344	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2345	      token->opr.ctx_type = INSIDE_NOTWORD;
2346	    }
2347	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2348	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350	    {
2351	      *err = REG_ESPACE;
2352	      return NULL;
2353	    }
2354	}
2355      else
2356	{
2357	  tree = create_token_tree (dfa, NULL, NULL, token);
2358	  if (BE (tree == NULL, 0))
2359	    {
2360	      *err = REG_ESPACE;
2361	      return NULL;
2362	    }
2363	}
2364      /* We must return here, since ANCHORs can't be followed
2365	 by repetition operators.
2366	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2368      fetch_token (token, regexp, syntax);
2369      return tree;
2370    case OP_PERIOD:
2371      tree = create_token_tree (dfa, NULL, NULL, token);
2372      if (BE (tree == NULL, 0))
2373	{
2374	  *err = REG_ESPACE;
2375	  return NULL;
2376	}
2377      if (dfa->mb_cur_max > 1)
2378	dfa->has_mb_node = 1;
2379      break;
2380    case OP_WORD:
2381    case OP_NOTWORD:
2382      tree = build_charclass_op (dfa, regexp->trans,
2383				 (const unsigned char *) "alnum",
2384				 (const unsigned char *) "_",
2385				 token->type == OP_NOTWORD, err);
2386      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387	return NULL;
2388      break;
2389    case OP_SPACE:
2390    case OP_NOTSPACE:
2391      tree = build_charclass_op (dfa, regexp->trans,
2392				 (const unsigned char *) "space",
2393				 (const unsigned char *) "",
2394				 token->type == OP_NOTSPACE, err);
2395      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396	return NULL;
2397      break;
2398    case OP_ALT:
2399    case END_OF_RE:
2400      return NULL;
2401    case BACK_SLASH:
2402      *err = REG_EESCAPE;
2403      return NULL;
2404    default:
2405      /* Must not happen?  */
2406#ifdef DEBUG
2407      assert (0);
2408#endif
2409      return NULL;
2410    }
2411  fetch_token (token, regexp, syntax);
2412
2413  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415    {
2416      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418	return NULL;
2419      /* In BRE consecutive duplications are not allowed.  */
2420      if ((syntax & RE_CONTEXT_INVALID_DUP)
2421	  && (token->type == OP_DUP_ASTERISK
2422	      || token->type == OP_OPEN_DUP_NUM))
2423	{
2424	  *err = REG_BADRPT;
2425	  return NULL;
2426	}
2427    }
2428
2429  return tree;
2430}
2431
2432/* This function build the following tree, from regular expression
2433   (<reg_exp>):
2434	 SUBEXP
2435	    |
2436	<reg_exp>
2437*/
2438
2439static bin_tree_t *
2440parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2442{
2443  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444  bin_tree_t *tree;
2445  size_t cur_nsub;
2446  cur_nsub = preg->re_nsub++;
2447
2448  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449
2450  /* The subexpression may be a null string.  */
2451  if (token->type == OP_CLOSE_SUBEXP)
2452    tree = NULL;
2453  else
2454    {
2455      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457	{
2458	  if (tree != NULL)
2459	    postorder (tree, free_tree, NULL);
2460	  *err = REG_EPAREN;
2461	}
2462      if (BE (*err != REG_NOERROR, 0))
2463	return NULL;
2464    }
2465
2466  if (cur_nsub <= '9' - '1')
2467    dfa->completed_bkref_map |= 1 << cur_nsub;
2468
2469  tree = create_tree (dfa, tree, NULL, SUBEXP);
2470  if (BE (tree == NULL, 0))
2471    {
2472      *err = REG_ESPACE;
2473      return NULL;
2474    }
2475  tree->token.opr.idx = cur_nsub;
2476  return tree;
2477}
2478
2479/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2480
2481static bin_tree_t *
2482parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2483	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2484{
2485  bin_tree_t *tree = NULL, *old_tree = NULL;
2486  int i, start, end, start_idx = re_string_cur_idx (regexp);
2487  re_token_t start_token = *token;
2488
2489  if (token->type == OP_OPEN_DUP_NUM)
2490    {
2491      end = 0;
2492      start = fetch_number (regexp, token, syntax);
2493      if (start == -1)
2494	{
2495	  if (token->type == CHARACTER && token->opr.c == ',')
2496	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2497	  else
2498	    {
2499	      *err = REG_BADBR; /* <re>{} is invalid.  */
2500	      return NULL;
2501	    }
2502	}
2503      if (BE (start != -2, 1))
2504	{
2505	  /* We treat "{n}" as "{n,n}".  */
2506	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2507		 : ((token->type == CHARACTER && token->opr.c == ',')
2508		    ? fetch_number (regexp, token, syntax) : -2));
2509	}
2510      if (BE (start == -2 || end == -2, 0))
2511	{
2512	  /* Invalid sequence.  */
2513	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2514	    {
2515	      if (token->type == END_OF_RE)
2516		*err = REG_EBRACE;
2517	      else
2518		*err = REG_BADBR;
2519
2520	      return NULL;
2521	    }
2522
2523	  /* If the syntax bit is set, rollback.  */
2524	  re_string_set_index (regexp, start_idx);
2525	  *token = start_token;
2526	  token->type = CHARACTER;
2527	  /* mb_partial and word_char bits should be already initialized by
2528	     peek_token.  */
2529	  return elem;
2530	}
2531
2532      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2533	{
2534	  /* First number greater than second.  */
2535	  *err = REG_BADBR;
2536	  return NULL;
2537	}
2538    }
2539  else
2540    {
2541      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2542      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2543    }
2544
2545  fetch_token (token, regexp, syntax);
2546
2547  if (BE (elem == NULL, 0))
2548    return NULL;
2549  if (BE (start == 0 && end == 0, 0))
2550    {
2551      postorder (elem, free_tree, NULL);
2552      return NULL;
2553    }
2554
2555  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2556  if (BE (start > 0, 0))
2557    {
2558      tree = elem;
2559      for (i = 2; i <= start; ++i)
2560	{
2561	  elem = duplicate_tree (elem, dfa);
2562	  tree = create_tree (dfa, tree, elem, CONCAT);
2563	  if (BE (elem == NULL || tree == NULL, 0))
2564	    goto parse_dup_op_espace;
2565	}
2566
2567      if (start == end)
2568	return tree;
2569
2570      /* Duplicate ELEM before it is marked optional.  */
2571      elem = duplicate_tree (elem, dfa);
2572      old_tree = tree;
2573    }
2574  else
2575    old_tree = NULL;
2576
2577  if (elem->token.type == SUBEXP)
2578    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2579
2580  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2581  if (BE (tree == NULL, 0))
2582    goto parse_dup_op_espace;
2583
2584  /* This loop is actually executed only when end != -1,
2585     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2586     already created the start+1-th copy.  */
2587  for (i = start + 2; i <= end; ++i)
2588    {
2589      elem = duplicate_tree (elem, dfa);
2590      tree = create_tree (dfa, tree, elem, CONCAT);
2591      if (BE (elem == NULL || tree == NULL, 0))
2592	goto parse_dup_op_espace;
2593
2594      tree = create_tree (dfa, tree, NULL, OP_ALT);
2595      if (BE (tree == NULL, 0))
2596	goto parse_dup_op_espace;
2597    }
2598
2599  if (old_tree)
2600    tree = create_tree (dfa, old_tree, tree, CONCAT);
2601
2602  return tree;
2603
2604 parse_dup_op_espace:
2605  *err = REG_ESPACE;
2606  return NULL;
2607}
2608
2609/* Size of the names for collating symbol/equivalence_class/character_class.
2610   I'm not sure, but maybe enough.  */
2611#define BRACKET_NAME_BUF_SIZE 32
2612
2613#ifndef _LIBC
2614  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615     Build the range expression which starts from START_ELEM, and ends
2616     at END_ELEM.  The result are written to MBCSET and SBCSET.
2617     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618     mbcset->range_ends, is a pointer argument sinse we may
2619     update it.  */
2620
2621static reg_errcode_t
2622internal_function
2623# ifdef RE_ENABLE_I18N
2624build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2625		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2626# else /* not RE_ENABLE_I18N */
2627build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2628		 bracket_elem_t *end_elem)
2629# endif /* not RE_ENABLE_I18N */
2630{
2631  unsigned int start_ch, end_ch;
2632  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2633  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2634	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2635	  0))
2636    return REG_ERANGE;
2637
2638  /* We can handle no multi character collating elements without libc
2639     support.  */
2640  if (BE ((start_elem->type == COLL_SYM
2641	   && strlen ((char *) start_elem->opr.name) > 1)
2642	  || (end_elem->type == COLL_SYM
2643	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2644    return REG_ECOLLATE;
2645
2646# ifdef RE_ENABLE_I18N
2647  {
2648    wchar_t wc;
2649    wint_t start_wc;
2650    wint_t end_wc;
2651    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2652
2653    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2654		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2655		   : 0));
2656    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2657	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2658		 : 0));
2659    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2660		? __btowc (start_ch) : start_elem->opr.wch);
2661    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2662	      ? __btowc (end_ch) : end_elem->opr.wch);
2663    if (start_wc == WEOF || end_wc == WEOF)
2664      return REG_ECOLLATE;
2665    cmp_buf[0] = start_wc;
2666    cmp_buf[4] = end_wc;
2667    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2668      return REG_ERANGE;
2669
2670    /* Got valid collation sequence values, add them as a new entry.
2671       However, for !_LIBC we have no collation elements: if the
2672       character set is single byte, the single byte character set
2673       that we build below suffices.  parse_bracket_exp passes
2674       no MBCSET if dfa->mb_cur_max == 1.  */
2675    if (mbcset)
2676      {
2677	/* Check the space of the arrays.  */
2678	if (BE (*range_alloc == mbcset->nranges, 0))
2679	  {
2680	    /* There is not enough space, need realloc.  */
2681	    wchar_t *new_array_start, *new_array_end;
2682	    int new_nranges;
2683
2684	    /* +1 in case of mbcset->nranges is 0.  */
2685	    new_nranges = 2 * mbcset->nranges + 1;
2686	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2687	       are NULL if *range_alloc == 0.  */
2688	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2689					  new_nranges);
2690	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2691					new_nranges);
2692
2693	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2694	      return REG_ESPACE;
2695
2696	    mbcset->range_starts = new_array_start;
2697	    mbcset->range_ends = new_array_end;
2698	    *range_alloc = new_nranges;
2699	  }
2700
2701	mbcset->range_starts[mbcset->nranges] = start_wc;
2702	mbcset->range_ends[mbcset->nranges++] = end_wc;
2703      }
2704
2705    /* Build the table for single byte characters.  */
2706    for (wc = 0; wc < SBC_MAX; ++wc)
2707      {
2708	cmp_buf[2] = wc;
2709	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2710	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2711	  bitset_set (sbcset, wc);
2712      }
2713  }
2714# else /* not RE_ENABLE_I18N */
2715  {
2716    unsigned int ch;
2717    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2718		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719		   : 0));
2720    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2721	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722		 : 0));
2723    if (start_ch > end_ch)
2724      return REG_ERANGE;
2725    /* Build the table for single byte characters.  */
2726    for (ch = 0; ch < SBC_MAX; ++ch)
2727      if (start_ch <= ch  && ch <= end_ch)
2728	bitset_set (sbcset, ch);
2729  }
2730# endif /* not RE_ENABLE_I18N */
2731  return REG_NOERROR;
2732}
2733#endif /* not _LIBC */
2734
2735#ifndef _LIBC
2736/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2737   Build the collating element which is represented by NAME.
2738   The result are written to MBCSET and SBCSET.
2739   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2740   pointer argument since we may update it.  */
2741
2742static reg_errcode_t
2743internal_function
2744# ifdef RE_ENABLE_I18N
2745build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2746			int *coll_sym_alloc, const unsigned char *name)
2747# else /* not RE_ENABLE_I18N */
2748build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2749# endif /* not RE_ENABLE_I18N */
2750{
2751  size_t name_len = strlen ((const char *) name);
2752  if (BE (name_len != 1, 0))
2753    return REG_ECOLLATE;
2754  else
2755    {
2756      bitset_set (sbcset, name[0]);
2757      return REG_NOERROR;
2758    }
2759}
2760#endif /* not _LIBC */
2761
2762/* This function parse bracket expression like "[abc]", "[a-c]",
2763   "[[.a-a.]]" etc.  */
2764
2765static bin_tree_t *
2766parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2767		   reg_syntax_t syntax, reg_errcode_t *err)
2768{
2769#ifdef _LIBC
2770  const unsigned char *collseqmb;
2771  const char *collseqwc;
2772  uint32_t nrules;
2773  int32_t table_size;
2774  const int32_t *symb_table;
2775  const unsigned char *extra;
2776
2777  /* Local function for parse_bracket_exp used in _LIBC environement.
2778     Seek the collating symbol entry correspondings to NAME.
2779     Return the index of the symbol in the SYMB_TABLE.  */
2780
2781  auto inline int32_t
2782  __attribute ((always_inline))
2783  seek_collating_symbol_entry (name, name_len)
2784	 const unsigned char *name;
2785	 size_t name_len;
2786    {
2787      int32_t hash = elem_hash ((const char *) name, name_len);
2788      int32_t elem = hash % table_size;
2789      if (symb_table[2 * elem] != 0)
2790	{
2791	  int32_t second = hash % (table_size - 2) + 1;
2792
2793	  do
2794	    {
2795	      /* First compare the hashing value.  */
2796	      if (symb_table[2 * elem] == hash
2797		  /* Compare the length of the name.  */
2798		  && name_len == extra[symb_table[2 * elem + 1]]
2799		  /* Compare the name.  */
2800		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2801			     name_len) == 0)
2802		{
2803		  /* Yep, this is the entry.  */
2804		  break;
2805		}
2806
2807	      /* Next entry.  */
2808	      elem += second;
2809	    }
2810	  while (symb_table[2 * elem] != 0);
2811	}
2812      return elem;
2813    }
2814
2815  /* Local function for parse_bracket_exp used in _LIBC environment.
2816     Look up the collation sequence value of BR_ELEM.
2817     Return the value if succeeded, UINT_MAX otherwise.  */
2818
2819  auto inline unsigned int
2820  __attribute ((always_inline))
2821  lookup_collation_sequence_value (br_elem)
2822	 bracket_elem_t *br_elem;
2823    {
2824      if (br_elem->type == SB_CHAR)
2825	{
2826	  /*
2827	  if (MB_CUR_MAX == 1)
2828	  */
2829	  if (nrules == 0)
2830	    return collseqmb[br_elem->opr.ch];
2831	  else
2832	    {
2833	      wint_t wc = __btowc (br_elem->opr.ch);
2834	      return __collseq_table_lookup (collseqwc, wc);
2835	    }
2836	}
2837      else if (br_elem->type == MB_CHAR)
2838	{
2839	  if (nrules != 0)
2840	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2841	}
2842      else if (br_elem->type == COLL_SYM)
2843	{
2844	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2845	  if (nrules != 0)
2846	    {
2847	      int32_t elem, idx;
2848	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2849						  sym_name_len);
2850	      if (symb_table[2 * elem] != 0)
2851		{
2852		  /* We found the entry.  */
2853		  idx = symb_table[2 * elem + 1];
2854		  /* Skip the name of collating element name.  */
2855		  idx += 1 + extra[idx];
2856		  /* Skip the byte sequence of the collating element.  */
2857		  idx += 1 + extra[idx];
2858		  /* Adjust for the alignment.  */
2859		  idx = (idx + 3) & ~3;
2860		  /* Skip the multibyte collation sequence value.  */
2861		  idx += sizeof (unsigned int);
2862		  /* Skip the wide char sequence of the collating element.  */
2863		  idx += sizeof (unsigned int) *
2864		    (1 + *(unsigned int *) (extra + idx));
2865		  /* Return the collation sequence value.  */
2866		  return *(unsigned int *) (extra + idx);
2867		}
2868	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2869		{
2870		  /* No valid character.  Match it as a single byte
2871		     character.  */
2872		  return collseqmb[br_elem->opr.name[0]];
2873		}
2874	    }
2875	  else if (sym_name_len == 1)
2876	    return collseqmb[br_elem->opr.name[0]];
2877	}
2878      return UINT_MAX;
2879    }
2880
2881  /* Local function for parse_bracket_exp used in _LIBC environement.
2882     Build the range expression which starts from START_ELEM, and ends
2883     at END_ELEM.  The result are written to MBCSET and SBCSET.
2884     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2885     mbcset->range_ends, is a pointer argument sinse we may
2886     update it.  */
2887
2888  auto inline reg_errcode_t
2889  __attribute ((always_inline))
2890  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2891	 re_charset_t *mbcset;
2892	 int *range_alloc;
2893	 bitset_t sbcset;
2894	 bracket_elem_t *start_elem, *end_elem;
2895    {
2896      unsigned int ch;
2897      uint32_t start_collseq;
2898      uint32_t end_collseq;
2899
2900      /* Equivalence Classes and Character Classes can't be a range
2901	 start/end.  */
2902      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2903	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2904	      0))
2905	return REG_ERANGE;
2906
2907      start_collseq = lookup_collation_sequence_value (start_elem);
2908      end_collseq = lookup_collation_sequence_value (end_elem);
2909      /* Check start/end collation sequence values.  */
2910      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2911	return REG_ECOLLATE;
2912      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2913	return REG_ERANGE;
2914
2915      /* Got valid collation sequence values, add them as a new entry.
2916	 However, if we have no collation elements, and the character set
2917	 is single byte, the single byte character set that we
2918	 build below suffices. */
2919      if (nrules > 0 || dfa->mb_cur_max > 1)
2920	{
2921	  /* Check the space of the arrays.  */
2922	  if (BE (*range_alloc == mbcset->nranges, 0))
2923	    {
2924	      /* There is not enough space, need realloc.  */
2925	      uint32_t *new_array_start;
2926	      uint32_t *new_array_end;
2927	      int new_nranges;
2928
2929	      /* +1 in case of mbcset->nranges is 0.  */
2930	      new_nranges = 2 * mbcset->nranges + 1;
2931	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2932					    new_nranges);
2933	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2934					  new_nranges);
2935
2936	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2937		return REG_ESPACE;
2938
2939	      mbcset->range_starts = new_array_start;
2940	      mbcset->range_ends = new_array_end;
2941	      *range_alloc = new_nranges;
2942	    }
2943
2944	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2945	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2946	}
2947
2948      /* Build the table for single byte characters.  */
2949      for (ch = 0; ch < SBC_MAX; ch++)
2950	{
2951	  uint32_t ch_collseq;
2952	  /*
2953	  if (MB_CUR_MAX == 1)
2954	  */
2955	  if (nrules == 0)
2956	    ch_collseq = collseqmb[ch];
2957	  else
2958	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2959	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2960	    bitset_set (sbcset, ch);
2961	}
2962      return REG_NOERROR;
2963    }
2964
2965  /* Local function for parse_bracket_exp used in _LIBC environement.
2966     Build the collating element which is represented by NAME.
2967     The result are written to MBCSET and SBCSET.
2968     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2969     pointer argument sinse we may update it.  */
2970
2971  auto inline reg_errcode_t
2972  __attribute ((always_inline))
2973  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2974	 re_charset_t *mbcset;
2975	 int *coll_sym_alloc;
2976	 bitset_t sbcset;
2977	 const unsigned char *name;
2978    {
2979      int32_t elem, idx;
2980      size_t name_len = strlen ((const char *) name);
2981      if (nrules != 0)
2982	{
2983	  elem = seek_collating_symbol_entry (name, name_len);
2984	  if (symb_table[2 * elem] != 0)
2985	    {
2986	      /* We found the entry.  */
2987	      idx = symb_table[2 * elem + 1];
2988	      /* Skip the name of collating element name.  */
2989	      idx += 1 + extra[idx];
2990	    }
2991	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2992	    {
2993	      /* No valid character, treat it as a normal
2994		 character.  */
2995	      bitset_set (sbcset, name[0]);
2996	      return REG_NOERROR;
2997	    }
2998	  else
2999	    return REG_ECOLLATE;
3000
3001	  /* Got valid collation sequence, add it as a new entry.  */
3002	  /* Check the space of the arrays.  */
3003	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3004	    {
3005	      /* Not enough, realloc it.  */
3006	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3007	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3008	      /* Use realloc since mbcset->coll_syms is NULL
3009		 if *alloc == 0.  */
3010	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3011						   new_coll_sym_alloc);
3012	      if (BE (new_coll_syms == NULL, 0))
3013		return REG_ESPACE;
3014	      mbcset->coll_syms = new_coll_syms;
3015	      *coll_sym_alloc = new_coll_sym_alloc;
3016	    }
3017	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3018	  return REG_NOERROR;
3019	}
3020      else
3021	{
3022	  if (BE (name_len != 1, 0))
3023	    return REG_ECOLLATE;
3024	  else
3025	    {
3026	      bitset_set (sbcset, name[0]);
3027	      return REG_NOERROR;
3028	    }
3029	}
3030    }
3031#endif
3032
3033  re_token_t br_token;
3034  re_bitset_ptr_t sbcset;
3035#ifdef RE_ENABLE_I18N
3036  re_charset_t *mbcset;
3037  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3038  int equiv_class_alloc = 0, char_class_alloc = 0;
3039#endif /* not RE_ENABLE_I18N */
3040  int non_match = 0;
3041  bin_tree_t *work_tree;
3042  int token_len;
3043  int first_round = 1;
3044#ifdef _LIBC
3045  collseqmb = (const unsigned char *)
3046    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3047  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3048  if (nrules)
3049    {
3050      /*
3051      if (MB_CUR_MAX > 1)
3052      */
3053      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3054      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3055      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3056						  _NL_COLLATE_SYMB_TABLEMB);
3057      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3058						   _NL_COLLATE_SYMB_EXTRAMB);
3059    }
3060#endif
3061  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3062#ifdef RE_ENABLE_I18N
3063  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3064#endif /* RE_ENABLE_I18N */
3065#ifdef RE_ENABLE_I18N
3066  if (BE (sbcset == NULL || mbcset == NULL, 0))
3067#else
3068  if (BE (sbcset == NULL, 0))
3069#endif /* RE_ENABLE_I18N */
3070    {
3071      re_free (sbcset);
3072#ifdef RE_ENABLE_I18N
3073      re_free (mbcset);
3074#endif
3075      *err = REG_ESPACE;
3076      return NULL;
3077    }
3078
3079  token_len = peek_token_bracket (token, regexp, syntax);
3080  if (BE (token->type == END_OF_RE, 0))
3081    {
3082      *err = REG_BADPAT;
3083      goto parse_bracket_exp_free_return;
3084    }
3085  if (token->type == OP_NON_MATCH_LIST)
3086    {
3087#ifdef RE_ENABLE_I18N
3088      mbcset->non_match = 1;
3089#endif /* not RE_ENABLE_I18N */
3090      non_match = 1;
3091      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3092	bitset_set (sbcset, '\n');
3093      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3094      token_len = peek_token_bracket (token, regexp, syntax);
3095      if (BE (token->type == END_OF_RE, 0))
3096	{
3097	  *err = REG_BADPAT;
3098	  goto parse_bracket_exp_free_return;
3099	}
3100    }
3101
3102  /* We treat the first ']' as a normal character.  */
3103  if (token->type == OP_CLOSE_BRACKET)
3104    token->type = CHARACTER;
3105
3106  while (1)
3107    {
3108      bracket_elem_t start_elem, end_elem;
3109      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3110      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3111      reg_errcode_t ret;
3112      int token_len2 = 0, is_range_exp = 0;
3113      re_token_t token2;
3114
3115      start_elem.opr.name = start_name_buf;
3116      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3117				   syntax, first_round);
3118      if (BE (ret != REG_NOERROR, 0))
3119	{
3120	  *err = ret;
3121	  goto parse_bracket_exp_free_return;
3122	}
3123      first_round = 0;
3124
3125      /* Get information about the next token.  We need it in any case.  */
3126      token_len = peek_token_bracket (token, regexp, syntax);
3127
3128      /* Do not check for ranges if we know they are not allowed.  */
3129      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3130	{
3131	  if (BE (token->type == END_OF_RE, 0))
3132	    {
3133	      *err = REG_EBRACK;
3134	      goto parse_bracket_exp_free_return;
3135	    }
3136	  if (token->type == OP_CHARSET_RANGE)
3137	    {
3138	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3139	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3140	      if (BE (token2.type == END_OF_RE, 0))
3141		{
3142		  *err = REG_EBRACK;
3143		  goto parse_bracket_exp_free_return;
3144		}
3145	      if (token2.type == OP_CLOSE_BRACKET)
3146		{
3147		  /* We treat the last '-' as a normal character.  */
3148		  re_string_skip_bytes (regexp, -token_len);
3149		  token->type = CHARACTER;
3150		}
3151	      else
3152		is_range_exp = 1;
3153	    }
3154	}
3155
3156      if (is_range_exp == 1)
3157	{
3158	  end_elem.opr.name = end_name_buf;
3159	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3160				       dfa, syntax, 1);
3161	  if (BE (ret != REG_NOERROR, 0))
3162	    {
3163	      *err = ret;
3164	      goto parse_bracket_exp_free_return;
3165	    }
3166
3167	  token_len = peek_token_bracket (token, regexp, syntax);
3168
3169#ifdef _LIBC
3170	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3171				  &start_elem, &end_elem);
3172#else
3173# ifdef RE_ENABLE_I18N
3174	  *err = build_range_exp (sbcset,
3175				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3176				  &range_alloc, &start_elem, &end_elem);
3177# else
3178	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3179# endif
3180#endif /* RE_ENABLE_I18N */
3181	  if (BE (*err != REG_NOERROR, 0))
3182	    goto parse_bracket_exp_free_return;
3183	}
3184      else
3185	{
3186	  switch (start_elem.type)
3187	    {
3188	    case SB_CHAR:
3189	      bitset_set (sbcset, start_elem.opr.ch);
3190	      break;
3191#ifdef RE_ENABLE_I18N
3192	    case MB_CHAR:
3193	      /* Check whether the array has enough space.  */
3194	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3195		{
3196		  wchar_t *new_mbchars;
3197		  /* Not enough, realloc it.  */
3198		  /* +1 in case of mbcset->nmbchars is 0.  */
3199		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3200		  /* Use realloc since array is NULL if *alloc == 0.  */
3201		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3202					    mbchar_alloc);
3203		  if (BE (new_mbchars == NULL, 0))
3204		    goto parse_bracket_exp_espace;
3205		  mbcset->mbchars = new_mbchars;
3206		}
3207	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3208	      break;
3209#endif /* RE_ENABLE_I18N */
3210	    case EQUIV_CLASS:
3211	      *err = build_equiv_class (sbcset,
3212#ifdef RE_ENABLE_I18N
3213					mbcset, &equiv_class_alloc,
3214#endif /* RE_ENABLE_I18N */
3215					start_elem.opr.name);
3216	      if (BE (*err != REG_NOERROR, 0))
3217		goto parse_bracket_exp_free_return;
3218	      break;
3219	    case COLL_SYM:
3220	      *err = build_collating_symbol (sbcset,
3221#ifdef RE_ENABLE_I18N
3222					     mbcset, &coll_sym_alloc,
3223#endif /* RE_ENABLE_I18N */
3224					     start_elem.opr.name);
3225	      if (BE (*err != REG_NOERROR, 0))
3226		goto parse_bracket_exp_free_return;
3227	      break;
3228	    case CHAR_CLASS:
3229	      *err = build_charclass (regexp->trans, sbcset,
3230#ifdef RE_ENABLE_I18N
3231				      mbcset, &char_class_alloc,
3232#endif /* RE_ENABLE_I18N */
3233				      start_elem.opr.name, syntax);
3234	      if (BE (*err != REG_NOERROR, 0))
3235	       goto parse_bracket_exp_free_return;
3236	      break;
3237	    default:
3238	      assert (0);
3239	      break;
3240	    }
3241	}
3242      if (BE (token->type == END_OF_RE, 0))
3243	{
3244	  *err = REG_EBRACK;
3245	  goto parse_bracket_exp_free_return;
3246	}
3247      if (token->type == OP_CLOSE_BRACKET)
3248	break;
3249    }
3250
3251  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3252
3253  /* If it is non-matching list.  */
3254  if (non_match)
3255    bitset_not (sbcset);
3256
3257#ifdef RE_ENABLE_I18N
3258  /* Ensure only single byte characters are set.  */
3259  if (dfa->mb_cur_max > 1)
3260    bitset_mask (sbcset, dfa->sb_char);
3261
3262  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3263      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3264						     || mbcset->non_match)))
3265    {
3266      bin_tree_t *mbc_tree;
3267      int sbc_idx;
3268      /* Build a tree for complex bracket.  */
3269      dfa->has_mb_node = 1;
3270      br_token.type = COMPLEX_BRACKET;
3271      br_token.opr.mbcset = mbcset;
3272      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3273      if (BE (mbc_tree == NULL, 0))
3274	goto parse_bracket_exp_espace;
3275      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3276	if (sbcset[sbc_idx])
3277	  break;
3278      /* If there are no bits set in sbcset, there is no point
3279	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3280      if (sbc_idx < BITSET_WORDS)
3281	{
3282	  /* Build a tree for simple bracket.  */
3283	  br_token.type = SIMPLE_BRACKET;
3284	  br_token.opr.sbcset = sbcset;
3285	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3286	  if (BE (work_tree == NULL, 0))
3287	    goto parse_bracket_exp_espace;
3288
3289	  /* Then join them by ALT node.  */
3290	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3291	  if (BE (work_tree == NULL, 0))
3292	    goto parse_bracket_exp_espace;
3293	}
3294      else
3295	{
3296	  re_free (sbcset);
3297	  work_tree = mbc_tree;
3298	}
3299    }
3300  else
3301#endif /* not RE_ENABLE_I18N */
3302    {
3303#ifdef RE_ENABLE_I18N
3304      free_charset (mbcset);
3305#endif
3306      /* Build a tree for simple bracket.  */
3307      br_token.type = SIMPLE_BRACKET;
3308      br_token.opr.sbcset = sbcset;
3309      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3310      if (BE (work_tree == NULL, 0))
3311	goto parse_bracket_exp_espace;
3312    }
3313  return work_tree;
3314
3315 parse_bracket_exp_espace:
3316  *err = REG_ESPACE;
3317 parse_bracket_exp_free_return:
3318  re_free (sbcset);
3319#ifdef RE_ENABLE_I18N
3320  free_charset (mbcset);
3321#endif /* RE_ENABLE_I18N */
3322  return NULL;
3323}
3324
3325/* Parse an element in the bracket expression.  */
3326
3327static reg_errcode_t
3328parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3329		       re_token_t *token, int token_len, re_dfa_t *dfa,
3330		       reg_syntax_t syntax, int accept_hyphen)
3331{
3332#ifdef RE_ENABLE_I18N
3333  int cur_char_size;
3334  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3335  if (cur_char_size > 1)
3336    {
3337      elem->type = MB_CHAR;
3338      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3339      re_string_skip_bytes (regexp, cur_char_size);
3340      return REG_NOERROR;
3341    }
3342#endif /* RE_ENABLE_I18N */
3343  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3344  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3345      || token->type == OP_OPEN_EQUIV_CLASS)
3346    return parse_bracket_symbol (elem, regexp, token);
3347  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3348    {
3349      /* A '-' must only appear as anything but a range indicator before
3350	 the closing bracket.  Everything else is an error.  */
3351      re_token_t token2;
3352      (void) peek_token_bracket (&token2, regexp, syntax);
3353      if (token2.type != OP_CLOSE_BRACKET)
3354	/* The actual error value is not standardized since this whole
3355	   case is undefined.  But ERANGE makes good sense.  */
3356	return REG_ERANGE;
3357    }
3358  elem->type = SB_CHAR;
3359  elem->opr.ch = token->opr.c;
3360  return REG_NOERROR;
3361}
3362
3363/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3364   such as [:<character_class>:], [.<collating_element>.], and
3365   [=<equivalent_class>=].  */
3366
3367static reg_errcode_t
3368parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3369		      re_token_t *token)
3370{
3371  unsigned char ch, delim = token->opr.c;
3372  int i = 0;
3373  if (re_string_eoi(regexp))
3374    return REG_EBRACK;
3375  for (;; ++i)
3376    {
3377      if (i >= BRACKET_NAME_BUF_SIZE)
3378	return REG_EBRACK;
3379      if (token->type == OP_OPEN_CHAR_CLASS)
3380	ch = re_string_fetch_byte_case (regexp);
3381      else
3382	ch = re_string_fetch_byte (regexp);
3383      if (re_string_eoi(regexp))
3384	return REG_EBRACK;
3385      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3386	break;
3387      elem->opr.name[i] = ch;
3388    }
3389  re_string_skip_bytes (regexp, 1);
3390  elem->opr.name[i] = '\0';
3391  switch (token->type)
3392    {
3393    case OP_OPEN_COLL_ELEM:
3394      elem->type = COLL_SYM;
3395      break;
3396    case OP_OPEN_EQUIV_CLASS:
3397      elem->type = EQUIV_CLASS;
3398      break;
3399    case OP_OPEN_CHAR_CLASS:
3400      elem->type = CHAR_CLASS;
3401      break;
3402    default:
3403      break;
3404    }
3405  return REG_NOERROR;
3406}
3407
3408  /* Helper function for parse_bracket_exp.
3409     Build the equivalence class which is represented by NAME.
3410     The result are written to MBCSET and SBCSET.
3411     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3412     is a pointer argument sinse we may update it.  */
3413
3414static reg_errcode_t
3415#ifdef RE_ENABLE_I18N
3416build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3417		   int *equiv_class_alloc, const unsigned char *name)
3418#else /* not RE_ENABLE_I18N */
3419build_equiv_class (bitset_t sbcset, const unsigned char *name)
3420#endif /* not RE_ENABLE_I18N */
3421{
3422#ifdef _LIBC
3423  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3424  if (nrules != 0)
3425    {
3426      const int32_t *table, *indirect;
3427      const unsigned char *weights, *extra, *cp;
3428      unsigned char char_buf[2];
3429      int32_t idx1, idx2;
3430      unsigned int ch;
3431      size_t len;
3432      /* This #include defines a local function!  */
3433# include <locale/weight.h>
3434      /* Calculate the index for equivalence class.  */
3435      cp = name;
3436      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3437      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3438					       _NL_COLLATE_WEIGHTMB);
3439      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3440						   _NL_COLLATE_EXTRAMB);
3441      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3442						_NL_COLLATE_INDIRECTMB);
3443      idx1 = findidx (&cp, -1);
3444      if (BE (idx1 == 0 || *cp != '\0', 0))
3445	/* This isn't a valid character.  */
3446	return REG_ECOLLATE;
3447
3448      /* Build single byte matcing table for this equivalence class.  */
3449      len = weights[idx1 & 0xffffff];
3450      for (ch = 0; ch < SBC_MAX; ++ch)
3451	{
3452	  char_buf[0] = ch;
3453	  cp = char_buf;
3454	  idx2 = findidx (&cp, 1);
3455/*
3456	  idx2 = table[ch];
3457*/
3458	  if (idx2 == 0)
3459	    /* This isn't a valid character.  */
3460	    continue;
3461	  /* Compare only if the length matches and the collation rule
3462	     index is the same.  */
3463	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3464	    {
3465	      int cnt = 0;
3466
3467	      while (cnt <= len &&
3468		     weights[(idx1 & 0xffffff) + 1 + cnt]
3469		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3470		++cnt;
3471
3472	      if (cnt > len)
3473		bitset_set (sbcset, ch);
3474	    }
3475	}
3476      /* Check whether the array has enough space.  */
3477      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3478	{
3479	  /* Not enough, realloc it.  */
3480	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3481	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3482	  /* Use realloc since the array is NULL if *alloc == 0.  */
3483	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3484						   int32_t,
3485						   new_equiv_class_alloc);
3486	  if (BE (new_equiv_classes == NULL, 0))
3487	    return REG_ESPACE;
3488	  mbcset->equiv_classes = new_equiv_classes;
3489	  *equiv_class_alloc = new_equiv_class_alloc;
3490	}
3491      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3492    }
3493  else
3494#endif /* _LIBC */
3495    {
3496      if (BE (strlen ((const char *) name) != 1, 0))
3497	return REG_ECOLLATE;
3498      bitset_set (sbcset, *name);
3499    }
3500  return REG_NOERROR;
3501}
3502
3503  /* Helper function for parse_bracket_exp.
3504     Build the character class which is represented by NAME.
3505     The result are written to MBCSET and SBCSET.
3506     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3507     is a pointer argument sinse we may update it.  */
3508
3509static reg_errcode_t
3510#ifdef RE_ENABLE_I18N
3511build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3512		 re_charset_t *mbcset, int *char_class_alloc,
3513		 const unsigned char *class_name, reg_syntax_t syntax)
3514#else /* not RE_ENABLE_I18N */
3515build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3516		 const unsigned char *class_name, reg_syntax_t syntax)
3517#endif /* not RE_ENABLE_I18N */
3518{
3519  int i;
3520  const char *name = (const char *) class_name;
3521
3522  /* In case of REG_ICASE "upper" and "lower" match the both of
3523     upper and lower cases.  */
3524  if ((syntax & RE_ICASE)
3525      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3526    name = "alpha";
3527
3528#ifdef RE_ENABLE_I18N
3529  /* Check the space of the arrays.  */
3530  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3531    {
3532      /* Not enough, realloc it.  */
3533      /* +1 in case of mbcset->nchar_classes is 0.  */
3534      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3535      /* Use realloc since array is NULL if *alloc == 0.  */
3536      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3537					       new_char_class_alloc);
3538      if (BE (new_char_classes == NULL, 0))
3539	return REG_ESPACE;
3540      mbcset->char_classes = new_char_classes;
3541      *char_class_alloc = new_char_class_alloc;
3542    }
3543  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3544#endif /* RE_ENABLE_I18N */
3545
3546#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3547  do {						\
3548    if (BE (trans != NULL, 0))			\
3549      {						\
3550	for (i = 0; i < SBC_MAX; ++i)		\
3551  	  if (ctype_func (i))			\
3552	    bitset_set (sbcset, trans[i]);	\
3553      }						\
3554    else					\
3555      {						\
3556	for (i = 0; i < SBC_MAX; ++i)		\
3557  	  if (ctype_func (i))			\
3558	    bitset_set (sbcset, i);		\
3559      }						\
3560  } while (0)
3561
3562  if (strcmp (name, "alnum") == 0)
3563    BUILD_CHARCLASS_LOOP (isalnum);
3564  else if (strcmp (name, "cntrl") == 0)
3565    BUILD_CHARCLASS_LOOP (iscntrl);
3566  else if (strcmp (name, "lower") == 0)
3567    BUILD_CHARCLASS_LOOP (islower);
3568  else if (strcmp (name, "space") == 0)
3569    BUILD_CHARCLASS_LOOP (isspace);
3570  else if (strcmp (name, "alpha") == 0)
3571    BUILD_CHARCLASS_LOOP (isalpha);
3572  else if (strcmp (name, "digit") == 0)
3573    BUILD_CHARCLASS_LOOP (isdigit);
3574  else if (strcmp (name, "print") == 0)
3575    BUILD_CHARCLASS_LOOP (isprint);
3576  else if (strcmp (name, "upper") == 0)
3577    BUILD_CHARCLASS_LOOP (isupper);
3578  else if (strcmp (name, "blank") == 0)
3579    BUILD_CHARCLASS_LOOP (isblank);
3580  else if (strcmp (name, "graph") == 0)
3581    BUILD_CHARCLASS_LOOP (isgraph);
3582  else if (strcmp (name, "punct") == 0)
3583    BUILD_CHARCLASS_LOOP (ispunct);
3584  else if (strcmp (name, "xdigit") == 0)
3585    BUILD_CHARCLASS_LOOP (isxdigit);
3586  else
3587    return REG_ECTYPE;
3588
3589  return REG_NOERROR;
3590}
3591
3592static bin_tree_t *
3593build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3594		    const unsigned char *class_name,
3595		    const unsigned char *extra, int non_match,
3596		    reg_errcode_t *err)
3597{
3598  re_bitset_ptr_t sbcset;
3599#ifdef RE_ENABLE_I18N
3600  re_charset_t *mbcset;
3601  int alloc = 0;
3602#endif /* not RE_ENABLE_I18N */
3603  reg_errcode_t ret;
3604  re_token_t br_token;
3605  bin_tree_t *tree;
3606
3607  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3608#ifdef RE_ENABLE_I18N
3609  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3610#endif /* RE_ENABLE_I18N */
3611
3612#ifdef RE_ENABLE_I18N
3613  if (BE (sbcset == NULL || mbcset == NULL, 0))
3614#else /* not RE_ENABLE_I18N */
3615  if (BE (sbcset == NULL, 0))
3616#endif /* not RE_ENABLE_I18N */
3617    {
3618      *err = REG_ESPACE;
3619      return NULL;
3620    }
3621
3622  if (non_match)
3623    {
3624#ifdef RE_ENABLE_I18N
3625      mbcset->non_match = 1;
3626#endif /* not RE_ENABLE_I18N */
3627    }
3628
3629  /* We don't care the syntax in this case.  */
3630  ret = build_charclass (trans, sbcset,
3631#ifdef RE_ENABLE_I18N
3632			 mbcset, &alloc,
3633#endif /* RE_ENABLE_I18N */
3634			 class_name, 0);
3635
3636  if (BE (ret != REG_NOERROR, 0))
3637    {
3638      re_free (sbcset);
3639#ifdef RE_ENABLE_I18N
3640      free_charset (mbcset);
3641#endif /* RE_ENABLE_I18N */
3642      *err = ret;
3643      return NULL;
3644    }
3645  /* \w match '_' also.  */
3646  for (; *extra; extra++)
3647    bitset_set (sbcset, *extra);
3648
3649  /* If it is non-matching list.  */
3650  if (non_match)
3651    bitset_not (sbcset);
3652
3653#ifdef RE_ENABLE_I18N
3654  /* Ensure only single byte characters are set.  */
3655  if (dfa->mb_cur_max > 1)
3656    bitset_mask (sbcset, dfa->sb_char);
3657#endif
3658
3659  /* Build a tree for simple bracket.  */
3660  br_token.type = SIMPLE_BRACKET;
3661  br_token.opr.sbcset = sbcset;
3662  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3663  if (BE (tree == NULL, 0))
3664    goto build_word_op_espace;
3665
3666#ifdef RE_ENABLE_I18N
3667  if (dfa->mb_cur_max > 1)
3668    {
3669      bin_tree_t *mbc_tree;
3670      /* Build a tree for complex bracket.  */
3671      br_token.type = COMPLEX_BRACKET;
3672      br_token.opr.mbcset = mbcset;
3673      dfa->has_mb_node = 1;
3674      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3675      if (BE (mbc_tree == NULL, 0))
3676	goto build_word_op_espace;
3677      /* Then join them by ALT node.  */
3678      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3679      if (BE (mbc_tree != NULL, 1))
3680	return tree;
3681    }
3682  else
3683    {
3684      free_charset (mbcset);
3685      return tree;
3686    }
3687#else /* not RE_ENABLE_I18N */
3688  return tree;
3689#endif /* not RE_ENABLE_I18N */
3690
3691 build_word_op_espace:
3692  re_free (sbcset);
3693#ifdef RE_ENABLE_I18N
3694  free_charset (mbcset);
3695#endif /* RE_ENABLE_I18N */
3696  *err = REG_ESPACE;
3697  return NULL;
3698}
3699
3700/* This is intended for the expressions like "a{1,3}".
3701   Fetch a number from `input', and return the number.
3702   Return -1, if the number field is empty like "{,1}".
3703   Return -2, If an error is occured.  */
3704
3705static int
3706fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3707{
3708  int num = -1;
3709  unsigned char c;
3710  while (1)
3711    {
3712      fetch_token (token, input, syntax);
3713      c = token->opr.c;
3714      if (BE (token->type == END_OF_RE, 0))
3715	return -2;
3716      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3717	break;
3718      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3719	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3720      num = (num > RE_DUP_MAX) ? -2 : num;
3721    }
3722  return num;
3723}
3724
3725#ifdef RE_ENABLE_I18N
3726static void
3727free_charset (re_charset_t *cset)
3728{
3729  re_free (cset->mbchars);
3730# ifdef _LIBC
3731  re_free (cset->coll_syms);
3732  re_free (cset->equiv_classes);
3733  re_free (cset->range_starts);
3734  re_free (cset->range_ends);
3735# endif
3736  re_free (cset->char_classes);
3737  re_free (cset);
3738}
3739#endif /* RE_ENABLE_I18N */
3740
3741/* Functions for binary tree operation.  */
3742
3743/* Create a tree node.  */
3744
3745static bin_tree_t *
3746create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3747	     re_token_type_t type)
3748{
3749  re_token_t t;
3750  t.type = type;
3751  return create_token_tree (dfa, left, right, &t);
3752}
3753
3754static bin_tree_t *
3755create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3756		   const re_token_t *token)
3757{
3758  bin_tree_t *tree;
3759  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3760    {
3761      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3762
3763      if (storage == NULL)
3764	return NULL;
3765      storage->next = dfa->str_tree_storage;
3766      dfa->str_tree_storage = storage;
3767      dfa->str_tree_storage_idx = 0;
3768    }
3769  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3770
3771  tree->parent = NULL;
3772  tree->left = left;
3773  tree->right = right;
3774  tree->token = *token;
3775  tree->token.duplicated = 0;
3776  tree->token.opt_subexp = 0;
3777  tree->first = NULL;
3778  tree->next = NULL;
3779  tree->node_idx = -1;
3780
3781  if (left != NULL)
3782    left->parent = tree;
3783  if (right != NULL)
3784    right->parent = tree;
3785  return tree;
3786}
3787
3788/* Mark the tree SRC as an optional subexpression.
3789   To be called from preorder or postorder.  */
3790
3791static reg_errcode_t
3792mark_opt_subexp (void *extra, bin_tree_t *node)
3793{
3794  int idx = (int) (long) extra;
3795  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3796    node->token.opt_subexp = 1;
3797
3798  return REG_NOERROR;
3799}
3800
3801/* Free the allocated memory inside NODE. */
3802
3803static void
3804free_token (re_token_t *node)
3805{
3806#ifdef RE_ENABLE_I18N
3807  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3808    free_charset (node->opr.mbcset);
3809  else
3810#endif /* RE_ENABLE_I18N */
3811    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3812      re_free (node->opr.sbcset);
3813}
3814
3815/* Worker function for tree walking.  Free the allocated memory inside NODE
3816   and its children. */
3817
3818static reg_errcode_t
3819free_tree (void *extra, bin_tree_t *node)
3820{
3821  free_token (&node->token);
3822  return REG_NOERROR;
3823}
3824
3825
3826/* Duplicate the node SRC, and return new node.  This is a preorder
3827   visit similar to the one implemented by the generic visitor, but
3828   we need more infrastructure to maintain two parallel trees --- so,
3829   it's easier to duplicate.  */
3830
3831static bin_tree_t *
3832duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3833{
3834  const bin_tree_t *node;
3835  bin_tree_t *dup_root;
3836  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3837
3838  for (node = root; ; )
3839    {
3840      /* Create a new tree and link it back to the current parent.  */
3841      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3842      if (*p_new == NULL)
3843	return NULL;
3844      (*p_new)->parent = dup_node;
3845      (*p_new)->token.duplicated = 1;
3846      dup_node = *p_new;
3847
3848      /* Go to the left node, or up and to the right.  */
3849      if (node->left)
3850	{
3851	  node = node->left;
3852	  p_new = &dup_node->left;
3853	}
3854      else
3855	{
3856	  const bin_tree_t *prev = NULL;
3857	  while (node->right == prev || node->right == NULL)
3858	    {
3859	      prev = node;
3860	      node = node->parent;
3861	      dup_node = dup_node->parent;
3862	      if (!node)
3863		return dup_root;
3864	    }
3865	  node = node->right;
3866	  p_new = &dup_node->right;
3867	}
3868    }
3869}
3870