1/* Extended regular expression matching and search library.
2   Copyright (C) 2002-2014 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU General Public
8   License as published by the Free Software Foundation; either
9   version 3 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   General Public License for more details.
15
16   You should have received a copy of the GNU General Public
17   License along with the GNU C Library; if not, see
18   <http://www.gnu.org/licenses/>.  */
19
20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21					  size_t length, reg_syntax_t syntax);
22static void re_compile_fastmap_iter (regex_t *bufp,
23				     const re_dfastate_t *init_state,
24				     char *fastmap);
25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26#ifdef RE_ENABLE_I18N
27static void free_charset (re_charset_t *cset);
28#endif /* RE_ENABLE_I18N */
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36			       reg_errcode_t (fn (void *, bin_tree_t *)),
37			       void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39				reg_errcode_t (fn (void *, bin_tree_t *)),
40				void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44				 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50				   unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53					 Idx node, bool root);
54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55static Idx fetch_number (re_string_t *input, re_token_t *token,
56			 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
58			reg_syntax_t syntax) internal_function;
59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60			  reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62				  re_token_t *token, reg_syntax_t syntax,
63				  Idx nest, reg_errcode_t *err);
64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65				 re_token_t *token, reg_syntax_t syntax,
66				 Idx nest, reg_errcode_t *err);
67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68				     re_token_t *token, reg_syntax_t syntax,
69				     Idx nest, reg_errcode_t *err);
70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71				  re_token_t *token, reg_syntax_t syntax,
72				  Idx nest, reg_errcode_t *err);
73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74				 re_dfa_t *dfa, re_token_t *token,
75				 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77				      re_token_t *token, reg_syntax_t syntax,
78				      reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80					    re_string_t *regexp,
81					    re_token_t *token, int token_len,
82					    re_dfa_t *dfa,
83					    reg_syntax_t syntax,
84					    bool accept_hyphen);
85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86					  re_string_t *regexp,
87					  re_token_t *token);
88#ifdef RE_ENABLE_I18N
89static reg_errcode_t build_equiv_class (bitset_t sbcset,
90					re_charset_t *mbcset,
91					Idx *equiv_class_alloc,
92					const unsigned char *name);
93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94				      bitset_t sbcset,
95				      re_charset_t *mbcset,
96				      Idx *char_class_alloc,
97				      const char *class_name,
98				      reg_syntax_t syntax);
99#else  /* not RE_ENABLE_I18N */
100static reg_errcode_t build_equiv_class (bitset_t sbcset,
101					const unsigned char *name);
102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103				      bitset_t sbcset,
104				      const char *class_name,
105				      reg_syntax_t syntax);
106#endif /* not RE_ENABLE_I18N */
107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108				       RE_TRANSLATE_TYPE trans,
109				       const char *class_name,
110				       const char *extra,
111				       bool non_match, reg_errcode_t *err);
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113				bin_tree_t *left, bin_tree_t *right,
114				re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116				      bin_tree_t *left, bin_tree_t *right,
117				      const re_token_t *token);
118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122
123/* This table gives an error message for each of the error codes listed
124   in regex.h.  Obviously the order here has to be same as there.
125   POSIX doesn't require that we do anything for REG_NOERROR,
126   but why not be nice?  */
127
128static const char __re_error_msgid[] =
129  {
130#define REG_NOERROR_IDX	0
131    gettext_noop ("Success")	/* REG_NOERROR */
132    "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134    gettext_noop ("No match")	/* REG_NOMATCH */
135    "\0"
136#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138    "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141    "\0"
142#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144    "\0"
145#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147    "\0"
148#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150    "\0"
151#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153    "\0"
154#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156    "\0"
157#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159    "\0"
160#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162    "\0"
163#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164    gettext_noop ("Invalid range end")	/* REG_ERANGE */
165    "\0"
166#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167    gettext_noop ("Memory exhausted") /* REG_ESPACE */
168    "\0"
169#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171    "\0"
172#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173    gettext_noop ("Premature end of regular expression") /* REG_EEND */
174    "\0"
175#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176    gettext_noop ("Regular expression too big") /* REG_ESIZE */
177    "\0"
178#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180  };
181
182static const size_t __re_error_msgid_idx[] =
183  {
184    REG_NOERROR_IDX,
185    REG_NOMATCH_IDX,
186    REG_BADPAT_IDX,
187    REG_ECOLLATE_IDX,
188    REG_ECTYPE_IDX,
189    REG_EESCAPE_IDX,
190    REG_ESUBREG_IDX,
191    REG_EBRACK_IDX,
192    REG_EPAREN_IDX,
193    REG_EBRACE_IDX,
194    REG_BADBR_IDX,
195    REG_ERANGE_IDX,
196    REG_ESPACE_IDX,
197    REG_BADRPT_IDX,
198    REG_EEND_IDX,
199    REG_ESIZE_IDX,
200    REG_ERPAREN_IDX
201  };
202
203/* Entry points for GNU code.  */
204
205/* re_compile_pattern is the GNU regular expression compiler: it
206   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207   Returns 0 if the pattern was valid, otherwise an error string.
208
209   Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
210   are set in BUFP on entry.  */
211
212#ifdef _LIBC
213const char *
214re_compile_pattern (pattern, length, bufp)
215    const char *pattern;
216    size_t length;
217    struct re_pattern_buffer *bufp;
218#else /* size_t might promote */
219const char *
220re_compile_pattern (const char *pattern, size_t length,
221		    struct re_pattern_buffer *bufp)
222#endif
223{
224  reg_errcode_t ret;
225
226  /* And GNU code determines whether or not to get register information
227     by passing null for the REGS argument to re_match, etc., not by
228     setting no_sub, unless RE_NO_SUB is set.  */
229  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230
231  /* Match anchors at newline.  */
232  bufp->newline_anchor = 1;
233
234  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235
236  if (!ret)
237    return NULL;
238  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239}
240#ifdef _LIBC
241weak_alias (__re_compile_pattern, re_compile_pattern)
242#endif
243
244/* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
245   also be assigned to arbitrarily: each pattern buffer stores its own
246   syntax, so it can be changed between regex compilations.  */
247/* This has no initializer because initialized variables in Emacs
248   become read-only after dumping.  */
249reg_syntax_t re_syntax_options;
250
251
252/* Specify the precise syntax of regexps for compilation.  This provides
253   for compatibility for various utilities which historically have
254   different, incompatible syntaxes.
255
256   The argument SYNTAX is a bit mask comprised of the various bits
257   defined in regex.h.  We return the old syntax.  */
258
259reg_syntax_t
260re_set_syntax (syntax)
261    reg_syntax_t syntax;
262{
263  reg_syntax_t ret = re_syntax_options;
264
265  re_syntax_options = syntax;
266  return ret;
267}
268#ifdef _LIBC
269weak_alias (__re_set_syntax, re_set_syntax)
270#endif
271
272int
273re_compile_fastmap (bufp)
274    struct re_pattern_buffer *bufp;
275{
276  re_dfa_t *dfa = bufp->buffer;
277  char *fastmap = bufp->fastmap;
278
279  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281  if (dfa->init_state != dfa->init_state_word)
282    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283  if (dfa->init_state != dfa->init_state_nl)
284    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285  if (dfa->init_state != dfa->init_state_begbuf)
286    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287  bufp->fastmap_accurate = 1;
288  return 0;
289}
290#ifdef _LIBC
291weak_alias (__re_compile_fastmap, re_compile_fastmap)
292#endif
293
294static inline void
295__attribute__ ((always_inline))
296re_set_fastmap (char *fastmap, bool icase, int ch)
297{
298  fastmap[ch] = 1;
299  if (icase)
300    fastmap[tolower (ch)] = 1;
301}
302
303/* Helper function for re_compile_fastmap.
304   Compile fastmap for the initial_state INIT_STATE.  */
305
306static void
307re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308			 char *fastmap)
309{
310  re_dfa_t *dfa = bufp->buffer;
311  Idx node_cnt;
312  bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314    {
315      Idx node = init_state->nodes.elems[node_cnt];
316      re_token_type_t type = dfa->nodes[node].type;
317
318      if (type == CHARACTER)
319	{
320	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321#ifdef RE_ENABLE_I18N
322	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323	    {
324	      unsigned char buf[MB_LEN_MAX];
325	      unsigned char *p;
326	      wchar_t wc;
327	      mbstate_t state;
328
329	      p = buf;
330	      *p++ = dfa->nodes[node].opr.c;
331	      while (++node < dfa->nodes_len
332		     &&	dfa->nodes[node].type == CHARACTER
333		     && dfa->nodes[node].mb_partial)
334		*p++ = dfa->nodes[node].opr.c;
335	      memset (&state, '\0', sizeof (state));
336	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
337			     &state) == p - buf
338		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
339		      != (size_t) -1))
340		re_set_fastmap (fastmap, false, buf[0]);
341	    }
342#endif
343	}
344      else if (type == SIMPLE_BRACKET)
345	{
346	  int i, ch;
347	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348	    {
349	      int j;
350	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352		if (w & ((bitset_word_t) 1 << j))
353		  re_set_fastmap (fastmap, icase, ch);
354	    }
355	}
356#ifdef RE_ENABLE_I18N
357      else if (type == COMPLEX_BRACKET)
358	{
359	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
360	  Idx i;
361
362# ifdef _LIBC
363	  /* See if we have to try all bytes which start multiple collation
364	     elements.
365	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366		  collation element, and don't catch 'b' since 'b' is
367		  the only collation element which starts from 'b' (and
368		  it is caught by SIMPLE_BRACKET).  */
369	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370		  && (cset->ncoll_syms || cset->nranges))
371		{
372		  const int32_t *table = (const int32_t *)
373		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374		  for (i = 0; i < SBC_MAX; ++i)
375		    if (table[i] < 0)
376		      re_set_fastmap (fastmap, icase, i);
377		}
378# endif /* _LIBC */
379
380	  /* See if we have to start the match at all multibyte characters,
381	     i.e. where we would not find an invalid sequence.  This only
382	     applies to multibyte character sets; for single byte character
383	     sets, the SIMPLE_BRACKET again suffices.  */
384	  if (dfa->mb_cur_max > 1
385	      && (cset->nchar_classes || cset->non_match || cset->nranges
386# ifdef _LIBC
387		  || cset->nequiv_classes
388# endif /* _LIBC */
389		 ))
390	    {
391	      unsigned char c = 0;
392	      do
393		{
394		  mbstate_t mbs;
395		  memset (&mbs, 0, sizeof (mbs));
396		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397		    re_set_fastmap (fastmap, false, (int) c);
398		}
399	      while (++c != 0);
400	    }
401
402	  else
403	    {
404	      /* ... Else catch all bytes which can start the mbchars.  */
405	      for (i = 0; i < cset->nmbchars; ++i)
406		{
407		  char buf[256];
408		  mbstate_t state;
409		  memset (&state, '\0', sizeof (state));
410		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413		    {
414		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415			  != (size_t) -1)
416			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
417		    }
418		}
419	    }
420	}
421#endif /* RE_ENABLE_I18N */
422      else if (type == OP_PERIOD
423#ifdef RE_ENABLE_I18N
424	       || type == OP_UTF8_PERIOD
425#endif /* RE_ENABLE_I18N */
426	       || type == END_OF_RE)
427	{
428	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429	  if (type == END_OF_RE)
430	    bufp->can_be_null = 1;
431	  return;
432	}
433    }
434}
435
436/* Entry point for POSIX code.  */
437/* regcomp takes a regular expression as a string and compiles it.
438
439   PREG is a regex_t *.  We do not expect any fields to be initialized,
440   since POSIX says we shouldn't.  Thus, we set
441
442     'buffer' to the compiled pattern;
443     'used' to the length of the compiled pattern;
444     'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445       REG_EXTENDED bit in CFLAGS is set; otherwise, to
446       RE_SYNTAX_POSIX_BASIC;
447     'newline_anchor' to REG_NEWLINE being set in CFLAGS;
448     'fastmap' to an allocated space for the fastmap;
449     'fastmap_accurate' to zero;
450     're_nsub' to the number of subexpressions in PATTERN.
451
452   PATTERN is the address of the pattern string.
453
454   CFLAGS is a series of bits which affect compilation.
455
456     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457     use POSIX basic syntax.
458
459     If REG_NEWLINE is set, then . and [^...] don't match newline.
460     Also, regexec will try a match beginning after every newline.
461
462     If REG_ICASE is set, then we considers upper- and lowercase
463     versions of letters to be equivalent when matching.
464
465     If REG_NOSUB is set, then when PREG is passed to regexec, that
466     routine will report only success or failure, and nothing about the
467     registers.
468
469   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
470   the return codes and their meanings.)  */
471
472int
473regcomp (preg, pattern, cflags)
474    regex_t *_Restrict_ preg;
475    const char *_Restrict_ pattern;
476    int cflags;
477{
478  reg_errcode_t ret;
479  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480			 : RE_SYNTAX_POSIX_BASIC);
481
482  preg->buffer = NULL;
483  preg->allocated = 0;
484  preg->used = 0;
485
486  /* Try to allocate space for the fastmap.  */
487  preg->fastmap = re_malloc (char, SBC_MAX);
488  if (BE (preg->fastmap == NULL, 0))
489    return REG_ESPACE;
490
491  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492
493  /* If REG_NEWLINE is set, newlines are treated differently.  */
494  if (cflags & REG_NEWLINE)
495    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
496      syntax &= ~RE_DOT_NEWLINE;
497      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498      /* It also changes the matching behavior.  */
499      preg->newline_anchor = 1;
500    }
501  else
502    preg->newline_anchor = 0;
503  preg->no_sub = !!(cflags & REG_NOSUB);
504  preg->translate = NULL;
505
506  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507
508  /* POSIX doesn't distinguish between an unmatched open-group and an
509     unmatched close-group: both are REG_EPAREN.  */
510  if (ret == REG_ERPAREN)
511    ret = REG_EPAREN;
512
513  /* We have already checked preg->fastmap != NULL.  */
514  if (BE (ret == REG_NOERROR, 1))
515    /* Compute the fastmap now, since regexec cannot modify the pattern
516       buffer.  This function never fails in this implementation.  */
517    (void) re_compile_fastmap (preg);
518  else
519    {
520      /* Some error occurred while compiling the expression.  */
521      re_free (preg->fastmap);
522      preg->fastmap = NULL;
523    }
524
525  return (int) ret;
526}
527#ifdef _LIBC
528weak_alias (__regcomp, regcomp)
529#endif
530
531/* Returns a message corresponding to an error code, ERRCODE, returned
532   from either regcomp or regexec.   We don't use PREG here.  */
533
534#ifdef _LIBC
535size_t
536regerror (errcode, preg, errbuf, errbuf_size)
537    int errcode;
538    const regex_t *_Restrict_ preg;
539    char *_Restrict_ errbuf;
540    size_t errbuf_size;
541#else /* size_t might promote */
542size_t
543regerror (int errcode, const regex_t *_Restrict_ preg,
544	  char *_Restrict_ errbuf, size_t errbuf_size)
545#endif
546{
547  const char *msg;
548  size_t msg_size;
549
550  if (BE (errcode < 0
551	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
552			       / sizeof (__re_error_msgid_idx[0])), 0))
553    /* Only error codes returned by the rest of the code should be passed
554       to this routine.  If we are given anything else, or if other regex
555       code generates an invalid error code, then the program has a bug.
556       Dump core so we can fix it.  */
557    abort ();
558
559  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560
561  msg_size = strlen (msg) + 1; /* Includes the null.  */
562
563  if (BE (errbuf_size != 0, 1))
564    {
565      size_t cpy_size = msg_size;
566      if (BE (msg_size > errbuf_size, 0))
567	{
568	  cpy_size = errbuf_size - 1;
569	  errbuf[cpy_size] = '\0';
570	}
571      memcpy (errbuf, msg, cpy_size);
572    }
573
574  return msg_size;
575}
576#ifdef _LIBC
577weak_alias (__regerror, regerror)
578#endif
579
580
581#ifdef RE_ENABLE_I18N
582/* This static array is used for the map to single-byte characters when
583   UTF-8 is used.  Otherwise we would allocate memory just to initialize
584   it the same all the time.  UTF-8 is the preferred encoding so this is
585   a worthwhile optimization.  */
586static const bitset_t utf8_sb_map =
587{
588  /* Set the first 128 bits.  */
589# if defined __GNUC__ && !defined __STRICT_ANSI__
590  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591# else
592#  if 4 * BITSET_WORD_BITS < ASCII_CHARS
593#   error "bitset_word_t is narrower than 32 bits"
594#  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596#  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597  BITSET_WORD_MAX, BITSET_WORD_MAX,
598#  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
599  BITSET_WORD_MAX,
600#  endif
601  (BITSET_WORD_MAX
602   >> (SBC_MAX % BITSET_WORD_BITS == 0
603       ? 0
604       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
605# endif
606};
607#endif
608
609
610static void
611free_dfa_content (re_dfa_t *dfa)
612{
613  Idx i, j;
614
615  if (dfa->nodes)
616    for (i = 0; i < dfa->nodes_len; ++i)
617      free_token (dfa->nodes + i);
618  re_free (dfa->nexts);
619  for (i = 0; i < dfa->nodes_len; ++i)
620    {
621      if (dfa->eclosures != NULL)
622	re_node_set_free (dfa->eclosures + i);
623      if (dfa->inveclosures != NULL)
624	re_node_set_free (dfa->inveclosures + i);
625      if (dfa->edests != NULL)
626	re_node_set_free (dfa->edests + i);
627    }
628  re_free (dfa->edests);
629  re_free (dfa->eclosures);
630  re_free (dfa->inveclosures);
631  re_free (dfa->nodes);
632
633  if (dfa->state_table)
634    for (i = 0; i <= dfa->state_hash_mask; ++i)
635      {
636	struct re_state_table_entry *entry = dfa->state_table + i;
637	for (j = 0; j < entry->num; ++j)
638	  {
639	    re_dfastate_t *state = entry->array[j];
640	    free_state (state);
641	  }
642	re_free (entry->array);
643      }
644  re_free (dfa->state_table);
645#ifdef RE_ENABLE_I18N
646  if (dfa->sb_char != utf8_sb_map)
647    re_free (dfa->sb_char);
648#endif
649  re_free (dfa->subexp_map);
650#ifdef DEBUG
651  re_free (dfa->re_str);
652#endif
653
654  re_free (dfa);
655}
656
657
658/* Free dynamically allocated space used by PREG.  */
659
660void
661regfree (preg)
662    regex_t *preg;
663{
664  re_dfa_t *dfa = preg->buffer;
665  if (BE (dfa != NULL, 1))
666    {
667      lock_fini (dfa->lock);
668      free_dfa_content (dfa);
669    }
670  preg->buffer = NULL;
671  preg->allocated = 0;
672
673  re_free (preg->fastmap);
674  preg->fastmap = NULL;
675
676  re_free (preg->translate);
677  preg->translate = NULL;
678}
679#ifdef _LIBC
680weak_alias (__regfree, regfree)
681#endif
682
683/* Entry points compatible with 4.2 BSD regex library.  We don't define
684   them unless specifically requested.  */
685
686#if defined _REGEX_RE_COMP || defined _LIBC
687
688/* BSD has one and only one pattern buffer.  */
689static struct re_pattern_buffer re_comp_buf;
690
691char *
692# ifdef _LIBC
693/* Make these definitions weak in libc, so POSIX programs can redefine
694   these names if they don't use our functions, and still use
695   regcomp/regexec above without link errors.  */
696weak_function
697# endif
698re_comp (s)
699     const char *s;
700{
701  reg_errcode_t ret;
702  char *fastmap;
703
704  if (!s)
705    {
706      if (!re_comp_buf.buffer)
707	return gettext ("No previous regular expression");
708      return 0;
709    }
710
711  if (re_comp_buf.buffer)
712    {
713      fastmap = re_comp_buf.fastmap;
714      re_comp_buf.fastmap = NULL;
715      __regfree (&re_comp_buf);
716      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
717      re_comp_buf.fastmap = fastmap;
718    }
719
720  if (re_comp_buf.fastmap == NULL)
721    {
722      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
723      if (re_comp_buf.fastmap == NULL)
724	return (char *) gettext (__re_error_msgid
725				 + __re_error_msgid_idx[(int) REG_ESPACE]);
726    }
727
728  /* Since 're_exec' always passes NULL for the 'regs' argument, we
729     don't need to initialize the pattern buffer fields which affect it.  */
730
731  /* Match anchors at newlines.  */
732  re_comp_buf.newline_anchor = 1;
733
734  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735
736  if (!ret)
737    return NULL;
738
739  /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
740  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
741}
742
743#ifdef _LIBC
744libc_freeres_fn (free_mem)
745{
746  __regfree (&re_comp_buf);
747}
748#endif
749
750#endif /* _REGEX_RE_COMP */
751
752/* Internal entry point.
753   Compile the regular expression PATTERN, whose length is LENGTH.
754   SYNTAX indicate regular expression's syntax.  */
755
756static reg_errcode_t
757re_compile_internal (regex_t *preg, const char * pattern, size_t length,
758		     reg_syntax_t syntax)
759{
760  reg_errcode_t err = REG_NOERROR;
761  re_dfa_t *dfa;
762  re_string_t regexp;
763
764  /* Initialize the pattern buffer.  */
765  preg->fastmap_accurate = 0;
766  preg->syntax = syntax;
767  preg->not_bol = preg->not_eol = 0;
768  preg->used = 0;
769  preg->re_nsub = 0;
770  preg->can_be_null = 0;
771  preg->regs_allocated = REGS_UNALLOCATED;
772
773  /* Initialize the dfa.  */
774  dfa = preg->buffer;
775  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
776    {
777      /* If zero allocated, but buffer is non-null, try to realloc
778	 enough space.  This loses if buffer's address is bogus, but
779	 that is the user's responsibility.  If ->buffer is NULL this
780	 is a simple allocation.  */
781      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
782      if (dfa == NULL)
783	return REG_ESPACE;
784      preg->allocated = sizeof (re_dfa_t);
785      preg->buffer = dfa;
786    }
787  preg->used = sizeof (re_dfa_t);
788
789  err = init_dfa (dfa, length);
790  if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
791    err = REG_ESPACE;
792  if (BE (err != REG_NOERROR, 0))
793    {
794      free_dfa_content (dfa);
795      preg->buffer = NULL;
796      preg->allocated = 0;
797      return err;
798    }
799#ifdef DEBUG
800  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
801  dfa->re_str = re_malloc (char, length + 1);
802  strncpy (dfa->re_str, pattern, length + 1);
803#endif
804
805  err = re_string_construct (&regexp, pattern, length, preg->translate,
806			     (syntax & RE_ICASE) != 0, dfa);
807  if (BE (err != REG_NOERROR, 0))
808    {
809    re_compile_internal_free_return:
810      free_workarea_compile (preg);
811      re_string_destruct (&regexp);
812      lock_fini (dfa->lock);
813      free_dfa_content (dfa);
814      preg->buffer = NULL;
815      preg->allocated = 0;
816      return err;
817    }
818
819  /* Parse the regular expression, and build a structure tree.  */
820  preg->re_nsub = 0;
821  dfa->str_tree = parse (&regexp, preg, syntax, &err);
822  if (BE (dfa->str_tree == NULL, 0))
823    goto re_compile_internal_free_return;
824
825  /* Analyze the tree and create the nfa.  */
826  err = analyze (preg);
827  if (BE (err != REG_NOERROR, 0))
828    goto re_compile_internal_free_return;
829
830#ifdef RE_ENABLE_I18N
831  /* If possible, do searching in single byte encoding to speed things up.  */
832  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
833    optimize_utf8 (dfa);
834#endif
835
836  /* Then create the initial state of the dfa.  */
837  err = create_initial_state (dfa);
838
839  /* Release work areas.  */
840  free_workarea_compile (preg);
841  re_string_destruct (&regexp);
842
843  if (BE (err != REG_NOERROR, 0))
844    {
845      lock_fini (dfa->lock);
846      free_dfa_content (dfa);
847      preg->buffer = NULL;
848      preg->allocated = 0;
849    }
850
851  return err;
852}
853
854/* Initialize DFA.  We use the length of the regular expression PAT_LEN
855   as the initial length of some arrays.  */
856
857static reg_errcode_t
858init_dfa (re_dfa_t *dfa, size_t pat_len)
859{
860  __re_size_t table_size;
861#ifndef _LIBC
862  const char *codeset_name;
863#endif
864#ifdef RE_ENABLE_I18N
865  size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
866#else
867  size_t max_i18n_object_size = 0;
868#endif
869  size_t max_object_size =
870    MAX (sizeof (struct re_state_table_entry),
871	 MAX (sizeof (re_token_t),
872	      MAX (sizeof (re_node_set),
873		   MAX (sizeof (regmatch_t),
874			max_i18n_object_size))));
875
876  memset (dfa, '\0', sizeof (re_dfa_t));
877
878  /* Force allocation of str_tree_storage the first time.  */
879  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
880
881  /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
882     calculation below, and for similar doubling calculations
883     elsewhere.  And it's <= rather than <, because some of the
884     doubling calculations add 1 afterwards.  */
885  if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
886    return REG_ESPACE;
887
888  dfa->nodes_alloc = pat_len + 1;
889  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
890
891  /*  table_size = 2 ^ ceil(log pat_len) */
892  for (table_size = 1; ; table_size <<= 1)
893    if (table_size > pat_len)
894      break;
895
896  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
897  dfa->state_hash_mask = table_size - 1;
898
899  dfa->mb_cur_max = MB_CUR_MAX;
900#ifdef _LIBC
901  if (dfa->mb_cur_max == 6
902      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
903    dfa->is_utf8 = 1;
904  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
905		       != 0);
906#else
907  codeset_name = nl_langinfo (CODESET);
908  if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
909      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
910      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
911      && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
912    dfa->is_utf8 = 1;
913
914  /* We check exhaustively in the loop below if this charset is a
915     superset of ASCII.  */
916  dfa->map_notascii = 0;
917#endif
918
919#ifdef RE_ENABLE_I18N
920  if (dfa->mb_cur_max > 1)
921    {
922      if (dfa->is_utf8)
923	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
924      else
925	{
926	  int i, j, ch;
927
928	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
929	  if (BE (dfa->sb_char == NULL, 0))
930	    return REG_ESPACE;
931
932	  /* Set the bits corresponding to single byte chars.  */
933	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
934	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
935	      {
936		wint_t wch = __btowc (ch);
937		if (wch != WEOF)
938		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
939# ifndef _LIBC
940		if (isascii (ch) && wch != ch)
941		  dfa->map_notascii = 1;
942# endif
943	      }
944	}
945    }
946#endif
947
948  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
949    return REG_ESPACE;
950  return REG_NOERROR;
951}
952
953/* Initialize WORD_CHAR table, which indicate which character is
954   "word".  In this case "word" means that it is the word construction
955   character used by some operators like "\<", "\>", etc.  */
956
957static void
958internal_function
959init_word_char (re_dfa_t *dfa)
960{
961  int i = 0;
962  int j;
963  int ch = 0;
964  dfa->word_ops_used = 1;
965  if (BE (dfa->map_notascii == 0, 1))
966    {
967      bitset_word_t bits0 = 0x00000000;
968      bitset_word_t bits1 = 0x03ff0000;
969      bitset_word_t bits2 = 0x87fffffe;
970      bitset_word_t bits3 = 0x07fffffe;
971      if (BITSET_WORD_BITS == 64)
972	{
973	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
974	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
975	  i = 2;
976	}
977      else if (BITSET_WORD_BITS == 32)
978	{
979	  dfa->word_char[0] = bits0;
980	  dfa->word_char[1] = bits1;
981	  dfa->word_char[2] = bits2;
982	  dfa->word_char[3] = bits3;
983	  i = 4;
984	}
985      else
986        goto general_case;
987      ch = 128;
988
989      if (BE (dfa->is_utf8, 1))
990	{
991	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
992	  return;
993	}
994    }
995
996 general_case:
997  for (; i < BITSET_WORDS; ++i)
998    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
999      if (isalnum (ch) || ch == '_')
1000	dfa->word_char[i] |= (bitset_word_t) 1 << j;
1001}
1002
1003/* Free the work area which are only used while compiling.  */
1004
1005static void
1006free_workarea_compile (regex_t *preg)
1007{
1008  re_dfa_t *dfa = preg->buffer;
1009  bin_tree_storage_t *storage, *next;
1010  for (storage = dfa->str_tree_storage; storage; storage = next)
1011    {
1012      next = storage->next;
1013      re_free (storage);
1014    }
1015  dfa->str_tree_storage = NULL;
1016  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1017  dfa->str_tree = NULL;
1018  re_free (dfa->org_indices);
1019  dfa->org_indices = NULL;
1020}
1021
1022/* Create initial states for all contexts.  */
1023
1024static reg_errcode_t
1025create_initial_state (re_dfa_t *dfa)
1026{
1027  Idx first, i;
1028  reg_errcode_t err;
1029  re_node_set init_nodes;
1030
1031  /* Initial states have the epsilon closure of the node which is
1032     the first node of the regular expression.  */
1033  first = dfa->str_tree->first->node_idx;
1034  dfa->init_node = first;
1035  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1036  if (BE (err != REG_NOERROR, 0))
1037    return err;
1038
1039  /* The back-references which are in initial states can epsilon transit,
1040     since in this case all of the subexpressions can be null.
1041     Then we add epsilon closures of the nodes which are the next nodes of
1042     the back-references.  */
1043  if (dfa->nbackref > 0)
1044    for (i = 0; i < init_nodes.nelem; ++i)
1045      {
1046	Idx node_idx = init_nodes.elems[i];
1047	re_token_type_t type = dfa->nodes[node_idx].type;
1048
1049	Idx clexp_idx;
1050	if (type != OP_BACK_REF)
1051	  continue;
1052	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1053	  {
1054	    re_token_t *clexp_node;
1055	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1056	    if (clexp_node->type == OP_CLOSE_SUBEXP
1057		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1058	      break;
1059	  }
1060	if (clexp_idx == init_nodes.nelem)
1061	  continue;
1062
1063	if (type == OP_BACK_REF)
1064	  {
1065	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1066	    if (!re_node_set_contains (&init_nodes, dest_idx))
1067	      {
1068		reg_errcode_t merge_err
1069                  = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1070		if (merge_err != REG_NOERROR)
1071		  return merge_err;
1072		i = 0;
1073	      }
1074	  }
1075      }
1076
1077  /* It must be the first time to invoke acquire_state.  */
1078  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1079  /* We don't check ERR here, since the initial state must not be NULL.  */
1080  if (BE (dfa->init_state == NULL, 0))
1081    return err;
1082  if (dfa->init_state->has_constraint)
1083    {
1084      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1085						       CONTEXT_WORD);
1086      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1087						     CONTEXT_NEWLINE);
1088      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1089							 &init_nodes,
1090							 CONTEXT_NEWLINE
1091							 | CONTEXT_BEGBUF);
1092      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1093	      || dfa->init_state_begbuf == NULL, 0))
1094	return err;
1095    }
1096  else
1097    dfa->init_state_word = dfa->init_state_nl
1098      = dfa->init_state_begbuf = dfa->init_state;
1099
1100  re_node_set_free (&init_nodes);
1101  return REG_NOERROR;
1102}
1103
1104#ifdef RE_ENABLE_I18N
1105/* If it is possible to do searching in single byte encoding instead of UTF-8
1106   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1107   DFA nodes where needed.  */
1108
1109static void
1110optimize_utf8 (re_dfa_t *dfa)
1111{
1112  Idx node;
1113  int i;
1114  bool mb_chars = false;
1115  bool has_period = false;
1116
1117  for (node = 0; node < dfa->nodes_len; ++node)
1118    switch (dfa->nodes[node].type)
1119      {
1120      case CHARACTER:
1121	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1122	  mb_chars = true;
1123	break;
1124      case ANCHOR:
1125	switch (dfa->nodes[node].opr.ctx_type)
1126	  {
1127	  case LINE_FIRST:
1128	  case LINE_LAST:
1129	  case BUF_FIRST:
1130	  case BUF_LAST:
1131	    break;
1132	  default:
1133	    /* Word anchors etc. cannot be handled.  It's okay to test
1134	       opr.ctx_type since constraints (for all DFA nodes) are
1135	       created by ORing one or more opr.ctx_type values.  */
1136	    return;
1137	  }
1138	break;
1139      case OP_PERIOD:
1140	has_period = true;
1141	break;
1142      case OP_BACK_REF:
1143      case OP_ALT:
1144      case END_OF_RE:
1145      case OP_DUP_ASTERISK:
1146      case OP_OPEN_SUBEXP:
1147      case OP_CLOSE_SUBEXP:
1148	break;
1149      case COMPLEX_BRACKET:
1150	return;
1151      case SIMPLE_BRACKET:
1152	/* Just double check.  */
1153	{
1154	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1155			? 0
1156			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1157	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1158	    {
1159	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1160		return;
1161	      rshift = 0;
1162	    }
1163	}
1164	break;
1165      default:
1166	abort ();
1167      }
1168
1169  if (mb_chars || has_period)
1170    for (node = 0; node < dfa->nodes_len; ++node)
1171      {
1172	if (dfa->nodes[node].type == CHARACTER
1173	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1174	  dfa->nodes[node].mb_partial = 0;
1175	else if (dfa->nodes[node].type == OP_PERIOD)
1176	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1177      }
1178
1179  /* The search can be in single byte locale.  */
1180  dfa->mb_cur_max = 1;
1181  dfa->is_utf8 = 0;
1182  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1183}
1184#endif
1185
1186/* Analyze the structure tree, and calculate "first", "next", "edest",
1187   "eclosure", and "inveclosure".  */
1188
1189static reg_errcode_t
1190analyze (regex_t *preg)
1191{
1192  re_dfa_t *dfa = preg->buffer;
1193  reg_errcode_t ret;
1194
1195  /* Allocate arrays.  */
1196  dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1197  dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1198  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1199  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1200  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1201	  || dfa->eclosures == NULL, 0))
1202    return REG_ESPACE;
1203
1204  dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1205  if (dfa->subexp_map != NULL)
1206    {
1207      Idx i;
1208      for (i = 0; i < preg->re_nsub; i++)
1209	dfa->subexp_map[i] = i;
1210      preorder (dfa->str_tree, optimize_subexps, dfa);
1211      for (i = 0; i < preg->re_nsub; i++)
1212	if (dfa->subexp_map[i] != i)
1213	  break;
1214      if (i == preg->re_nsub)
1215	{
1216	  free (dfa->subexp_map);
1217	  dfa->subexp_map = NULL;
1218	}
1219    }
1220
1221  ret = postorder (dfa->str_tree, lower_subexps, preg);
1222  if (BE (ret != REG_NOERROR, 0))
1223    return ret;
1224  ret = postorder (dfa->str_tree, calc_first, dfa);
1225  if (BE (ret != REG_NOERROR, 0))
1226    return ret;
1227  preorder (dfa->str_tree, calc_next, dfa);
1228  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1229  if (BE (ret != REG_NOERROR, 0))
1230    return ret;
1231  ret = calc_eclosure (dfa);
1232  if (BE (ret != REG_NOERROR, 0))
1233    return ret;
1234
1235  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1236     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1237  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1238      || dfa->nbackref)
1239    {
1240      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1241      if (BE (dfa->inveclosures == NULL, 0))
1242	return REG_ESPACE;
1243      ret = calc_inveclosure (dfa);
1244    }
1245
1246  return ret;
1247}
1248
1249/* Our parse trees are very unbalanced, so we cannot use a stack to
1250   implement parse tree visits.  Instead, we use parent pointers and
1251   some hairy code in these two functions.  */
1252static reg_errcode_t
1253postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254	   void *extra)
1255{
1256  bin_tree_t *node, *prev;
1257
1258  for (node = root; ; )
1259    {
1260      /* Descend down the tree, preferably to the left (or to the right
1261	 if that's the only child).  */
1262      while (node->left || node->right)
1263	if (node->left)
1264	  node = node->left;
1265	else
1266	  node = node->right;
1267
1268      do
1269	{
1270	  reg_errcode_t err = fn (extra, node);
1271	  if (BE (err != REG_NOERROR, 0))
1272	    return err;
1273	  if (node->parent == NULL)
1274	    return REG_NOERROR;
1275	  prev = node;
1276	  node = node->parent;
1277	}
1278      /* Go up while we have a node that is reached from the right.  */
1279      while (node->right == prev || node->right == NULL);
1280      node = node->right;
1281    }
1282}
1283
1284static reg_errcode_t
1285preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1286	  void *extra)
1287{
1288  bin_tree_t *node;
1289
1290  for (node = root; ; )
1291    {
1292      reg_errcode_t err = fn (extra, node);
1293      if (BE (err != REG_NOERROR, 0))
1294	return err;
1295
1296      /* Go to the left node, or up and to the right.  */
1297      if (node->left)
1298	node = node->left;
1299      else
1300	{
1301	  bin_tree_t *prev = NULL;
1302	  while (node->right == prev || node->right == NULL)
1303	    {
1304	      prev = node;
1305	      node = node->parent;
1306	      if (!node)
1307		return REG_NOERROR;
1308	    }
1309	  node = node->right;
1310	}
1311    }
1312}
1313
1314/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1315   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1316   backreferences as well.  Requires a preorder visit.  */
1317static reg_errcode_t
1318optimize_subexps (void *extra, bin_tree_t *node)
1319{
1320  re_dfa_t *dfa = (re_dfa_t *) extra;
1321
1322  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1323    {
1324      int idx = node->token.opr.idx;
1325      node->token.opr.idx = dfa->subexp_map[idx];
1326      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1327    }
1328
1329  else if (node->token.type == SUBEXP
1330	   && node->left && node->left->token.type == SUBEXP)
1331    {
1332      Idx other_idx = node->left->token.opr.idx;
1333
1334      node->left = node->left->left;
1335      if (node->left)
1336	node->left->parent = node;
1337
1338      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1339      if (other_idx < BITSET_WORD_BITS)
1340	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1341    }
1342
1343  return REG_NOERROR;
1344}
1345
1346/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1347   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1348static reg_errcode_t
1349lower_subexps (void *extra, bin_tree_t *node)
1350{
1351  regex_t *preg = (regex_t *) extra;
1352  reg_errcode_t err = REG_NOERROR;
1353
1354  if (node->left && node->left->token.type == SUBEXP)
1355    {
1356      node->left = lower_subexp (&err, preg, node->left);
1357      if (node->left)
1358	node->left->parent = node;
1359    }
1360  if (node->right && node->right->token.type == SUBEXP)
1361    {
1362      node->right = lower_subexp (&err, preg, node->right);
1363      if (node->right)
1364	node->right->parent = node;
1365    }
1366
1367  return err;
1368}
1369
1370static bin_tree_t *
1371lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1372{
1373  re_dfa_t *dfa = preg->buffer;
1374  bin_tree_t *body = node->left;
1375  bin_tree_t *op, *cls, *tree1, *tree;
1376
1377  if (preg->no_sub
1378      /* We do not optimize empty subexpressions, because otherwise we may
1379	 have bad CONCAT nodes with NULL children.  This is obviously not
1380	 very common, so we do not lose much.  An example that triggers
1381	 this case is the sed "script" /\(\)/x.  */
1382      && node->left != NULL
1383      && (node->token.opr.idx >= BITSET_WORD_BITS
1384	  || !(dfa->used_bkref_map
1385	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1386    return node->left;
1387
1388  /* Convert the SUBEXP node to the concatenation of an
1389     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1390  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1391  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1392  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1393  tree = create_tree (dfa, op, tree1, CONCAT);
1394  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1395    {
1396      *err = REG_ESPACE;
1397      return NULL;
1398    }
1399
1400  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1401  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1402  return tree;
1403}
1404
1405/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1406   nodes.  Requires a postorder visit.  */
1407static reg_errcode_t
1408calc_first (void *extra, bin_tree_t *node)
1409{
1410  re_dfa_t *dfa = (re_dfa_t *) extra;
1411  if (node->token.type == CONCAT)
1412    {
1413      node->first = node->left->first;
1414      node->node_idx = node->left->node_idx;
1415    }
1416  else
1417    {
1418      node->first = node;
1419      node->node_idx = re_dfa_add_node (dfa, node->token);
1420      if (BE (node->node_idx == REG_MISSING, 0))
1421	return REG_ESPACE;
1422      if (node->token.type == ANCHOR)
1423	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1424    }
1425  return REG_NOERROR;
1426}
1427
1428/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1429static reg_errcode_t
1430calc_next (void *extra, bin_tree_t *node)
1431{
1432  switch (node->token.type)
1433    {
1434    case OP_DUP_ASTERISK:
1435      node->left->next = node;
1436      break;
1437    case CONCAT:
1438      node->left->next = node->right->first;
1439      node->right->next = node->next;
1440      break;
1441    default:
1442      if (node->left)
1443	node->left->next = node->next;
1444      if (node->right)
1445	node->right->next = node->next;
1446      break;
1447    }
1448  return REG_NOERROR;
1449}
1450
1451/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1452static reg_errcode_t
1453link_nfa_nodes (void *extra, bin_tree_t *node)
1454{
1455  re_dfa_t *dfa = (re_dfa_t *) extra;
1456  Idx idx = node->node_idx;
1457  reg_errcode_t err = REG_NOERROR;
1458
1459  switch (node->token.type)
1460    {
1461    case CONCAT:
1462      break;
1463
1464    case END_OF_RE:
1465      assert (node->next == NULL);
1466      break;
1467
1468    case OP_DUP_ASTERISK:
1469    case OP_ALT:
1470      {
1471	Idx left, right;
1472	dfa->has_plural_match = 1;
1473	if (node->left != NULL)
1474	  left = node->left->first->node_idx;
1475	else
1476	  left = node->next->node_idx;
1477	if (node->right != NULL)
1478	  right = node->right->first->node_idx;
1479	else
1480	  right = node->next->node_idx;
1481	assert (REG_VALID_INDEX (left));
1482	assert (REG_VALID_INDEX (right));
1483	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1484      }
1485      break;
1486
1487    case ANCHOR:
1488    case OP_OPEN_SUBEXP:
1489    case OP_CLOSE_SUBEXP:
1490      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1491      break;
1492
1493    case OP_BACK_REF:
1494      dfa->nexts[idx] = node->next->node_idx;
1495      if (node->token.type == OP_BACK_REF)
1496	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1497      break;
1498
1499    default:
1500      assert (!IS_EPSILON_NODE (node->token.type));
1501      dfa->nexts[idx] = node->next->node_idx;
1502      break;
1503    }
1504
1505  return err;
1506}
1507
1508/* Duplicate the epsilon closure of the node ROOT_NODE.
1509   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1510   to their own constraint.  */
1511
1512static reg_errcode_t
1513internal_function
1514duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1515			Idx root_node, unsigned int init_constraint)
1516{
1517  Idx org_node, clone_node;
1518  bool ok;
1519  unsigned int constraint = init_constraint;
1520  for (org_node = top_org_node, clone_node = top_clone_node;;)
1521    {
1522      Idx org_dest, clone_dest;
1523      if (dfa->nodes[org_node].type == OP_BACK_REF)
1524	{
1525	  /* If the back reference epsilon-transit, its destination must
1526	     also have the constraint.  Then duplicate the epsilon closure
1527	     of the destination of the back reference, and store it in
1528	     edests of the back reference.  */
1529	  org_dest = dfa->nexts[org_node];
1530	  re_node_set_empty (dfa->edests + clone_node);
1531	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1532	  if (BE (clone_dest == REG_MISSING, 0))
1533	    return REG_ESPACE;
1534	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1535	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536	  if (BE (! ok, 0))
1537	    return REG_ESPACE;
1538	}
1539      else if (dfa->edests[org_node].nelem == 0)
1540	{
1541	  /* In case of the node can't epsilon-transit, don't duplicate the
1542	     destination and store the original destination as the
1543	     destination of the node.  */
1544	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1545	  break;
1546	}
1547      else if (dfa->edests[org_node].nelem == 1)
1548	{
1549	  /* In case of the node can epsilon-transit, and it has only one
1550	     destination.  */
1551	  org_dest = dfa->edests[org_node].elems[0];
1552	  re_node_set_empty (dfa->edests + clone_node);
1553	  /* If the node is root_node itself, it means the epsilon closure
1554	     has a loop.  Then tie it to the destination of the root_node.  */
1555	  if (org_node == root_node && clone_node != org_node)
1556	    {
1557	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1558	      if (BE (! ok, 0))
1559	        return REG_ESPACE;
1560	      break;
1561	    }
1562	  /* In case the node has another constraint, append it.  */
1563	  constraint |= dfa->nodes[org_node].constraint;
1564	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1565	  if (BE (clone_dest == REG_MISSING, 0))
1566	    return REG_ESPACE;
1567	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568	  if (BE (! ok, 0))
1569	    return REG_ESPACE;
1570	}
1571      else /* dfa->edests[org_node].nelem == 2 */
1572	{
1573	  /* In case of the node can epsilon-transit, and it has two
1574	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1575	  org_dest = dfa->edests[org_node].elems[0];
1576	  re_node_set_empty (dfa->edests + clone_node);
1577	  /* Search for a duplicated node which satisfies the constraint.  */
1578	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1579	  if (clone_dest == REG_MISSING)
1580	    {
1581	      /* There is no such duplicated node, create a new one.  */
1582	      reg_errcode_t err;
1583	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1584	      if (BE (clone_dest == REG_MISSING, 0))
1585		return REG_ESPACE;
1586	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1587	      if (BE (! ok, 0))
1588		return REG_ESPACE;
1589	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1590					    root_node, constraint);
1591	      if (BE (err != REG_NOERROR, 0))
1592		return err;
1593	    }
1594	  else
1595	    {
1596	      /* There is a duplicated node which satisfies the constraint,
1597		 use it to avoid infinite loop.  */
1598	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1599	      if (BE (! ok, 0))
1600		return REG_ESPACE;
1601	    }
1602
1603	  org_dest = dfa->edests[org_node].elems[1];
1604	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1605	  if (BE (clone_dest == REG_MISSING, 0))
1606	    return REG_ESPACE;
1607	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1608	  if (BE (! ok, 0))
1609	    return REG_ESPACE;
1610	}
1611      org_node = org_dest;
1612      clone_node = clone_dest;
1613    }
1614  return REG_NOERROR;
1615}
1616
1617/* Search for a node which is duplicated from the node ORG_NODE, and
1618   satisfies the constraint CONSTRAINT.  */
1619
1620static Idx
1621search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1622			unsigned int constraint)
1623{
1624  Idx idx;
1625  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1626    {
1627      if (org_node == dfa->org_indices[idx]
1628	  && constraint == dfa->nodes[idx].constraint)
1629	return idx; /* Found.  */
1630    }
1631  return REG_MISSING; /* Not found.  */
1632}
1633
1634/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1635   Return the index of the new node, or REG_MISSING if insufficient storage is
1636   available.  */
1637
1638static Idx
1639duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1640{
1641  Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1642  if (BE (dup_idx != REG_MISSING, 1))
1643    {
1644      dfa->nodes[dup_idx].constraint = constraint;
1645      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1646      dfa->nodes[dup_idx].duplicated = 1;
1647
1648      /* Store the index of the original node.  */
1649      dfa->org_indices[dup_idx] = org_idx;
1650    }
1651  return dup_idx;
1652}
1653
1654static reg_errcode_t
1655calc_inveclosure (re_dfa_t *dfa)
1656{
1657  Idx src, idx;
1658  bool ok;
1659  for (idx = 0; idx < dfa->nodes_len; ++idx)
1660    re_node_set_init_empty (dfa->inveclosures + idx);
1661
1662  for (src = 0; src < dfa->nodes_len; ++src)
1663    {
1664      Idx *elems = dfa->eclosures[src].elems;
1665      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1666	{
1667	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1668	  if (BE (! ok, 0))
1669	    return REG_ESPACE;
1670	}
1671    }
1672
1673  return REG_NOERROR;
1674}
1675
1676/* Calculate "eclosure" for all the node in DFA.  */
1677
1678static reg_errcode_t
1679calc_eclosure (re_dfa_t *dfa)
1680{
1681  Idx node_idx;
1682  bool incomplete;
1683#ifdef DEBUG
1684  assert (dfa->nodes_len > 0);
1685#endif
1686  incomplete = false;
1687  /* For each nodes, calculate epsilon closure.  */
1688  for (node_idx = 0; ; ++node_idx)
1689    {
1690      reg_errcode_t err;
1691      re_node_set eclosure_elem;
1692      if (node_idx == dfa->nodes_len)
1693	{
1694	  if (!incomplete)
1695	    break;
1696	  incomplete = false;
1697	  node_idx = 0;
1698	}
1699
1700#ifdef DEBUG
1701      assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1702#endif
1703
1704      /* If we have already calculated, skip it.  */
1705      if (dfa->eclosures[node_idx].nelem != 0)
1706	continue;
1707      /* Calculate epsilon closure of 'node_idx'.  */
1708      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1709      if (BE (err != REG_NOERROR, 0))
1710	return err;
1711
1712      if (dfa->eclosures[node_idx].nelem == 0)
1713	{
1714	  incomplete = true;
1715	  re_node_set_free (&eclosure_elem);
1716	}
1717    }
1718  return REG_NOERROR;
1719}
1720
1721/* Calculate epsilon closure of NODE.  */
1722
1723static reg_errcode_t
1724calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1725{
1726  reg_errcode_t err;
1727  Idx i;
1728  re_node_set eclosure;
1729  bool ok;
1730  bool incomplete = false;
1731  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1732  if (BE (err != REG_NOERROR, 0))
1733    return err;
1734
1735  /* This indicates that we are calculating this node now.
1736     We reference this value to avoid infinite loop.  */
1737  dfa->eclosures[node].nelem = REG_MISSING;
1738
1739  /* If the current node has constraints, duplicate all nodes
1740     since they must inherit the constraints.  */
1741  if (dfa->nodes[node].constraint
1742      && dfa->edests[node].nelem
1743      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1744    {
1745      err = duplicate_node_closure (dfa, node, node, node,
1746				    dfa->nodes[node].constraint);
1747      if (BE (err != REG_NOERROR, 0))
1748	return err;
1749    }
1750
1751  /* Expand each epsilon destination nodes.  */
1752  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1753    for (i = 0; i < dfa->edests[node].nelem; ++i)
1754      {
1755	re_node_set eclosure_elem;
1756	Idx edest = dfa->edests[node].elems[i];
1757	/* If calculating the epsilon closure of 'edest' is in progress,
1758	   return intermediate result.  */
1759	if (dfa->eclosures[edest].nelem == REG_MISSING)
1760	  {
1761	    incomplete = true;
1762	    continue;
1763	  }
1764	/* If we haven't calculated the epsilon closure of 'edest' yet,
1765	   calculate now. Otherwise use calculated epsilon closure.  */
1766	if (dfa->eclosures[edest].nelem == 0)
1767	  {
1768	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1769	    if (BE (err != REG_NOERROR, 0))
1770	      return err;
1771	  }
1772	else
1773	  eclosure_elem = dfa->eclosures[edest];
1774	/* Merge the epsilon closure of 'edest'.  */
1775	err = re_node_set_merge (&eclosure, &eclosure_elem);
1776	if (BE (err != REG_NOERROR, 0))
1777	  return err;
1778	/* If the epsilon closure of 'edest' is incomplete,
1779	   the epsilon closure of this node is also incomplete.  */
1780	if (dfa->eclosures[edest].nelem == 0)
1781	  {
1782	    incomplete = true;
1783	    re_node_set_free (&eclosure_elem);
1784	  }
1785      }
1786
1787  /* An epsilon closure includes itself.  */
1788  ok = re_node_set_insert (&eclosure, node);
1789  if (BE (! ok, 0))
1790    return REG_ESPACE;
1791  if (incomplete && !root)
1792    dfa->eclosures[node].nelem = 0;
1793  else
1794    dfa->eclosures[node] = eclosure;
1795  *new_set = eclosure;
1796  return REG_NOERROR;
1797}
1798
1799/* Functions for token which are used in the parser.  */
1800
1801/* Fetch a token from INPUT.
1802   We must not use this function inside bracket expressions.  */
1803
1804static void
1805internal_function
1806fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1807{
1808  re_string_skip_bytes (input, peek_token (result, input, syntax));
1809}
1810
1811/* Peek a token from INPUT, and return the length of the token.
1812   We must not use this function inside bracket expressions.  */
1813
1814static int
1815internal_function
1816peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1817{
1818  unsigned char c;
1819
1820  if (re_string_eoi (input))
1821    {
1822      token->type = END_OF_RE;
1823      return 0;
1824    }
1825
1826  c = re_string_peek_byte (input, 0);
1827  token->opr.c = c;
1828
1829  token->word_char = 0;
1830#ifdef RE_ENABLE_I18N
1831  token->mb_partial = 0;
1832  if (input->mb_cur_max > 1 &&
1833      !re_string_first_byte (input, re_string_cur_idx (input)))
1834    {
1835      token->type = CHARACTER;
1836      token->mb_partial = 1;
1837      return 1;
1838    }
1839#endif
1840  if (c == '\\')
1841    {
1842      unsigned char c2;
1843      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1844	{
1845	  token->type = BACK_SLASH;
1846	  return 1;
1847	}
1848
1849      c2 = re_string_peek_byte_case (input, 1);
1850      token->opr.c = c2;
1851      token->type = CHARACTER;
1852#ifdef RE_ENABLE_I18N
1853      if (input->mb_cur_max > 1)
1854	{
1855	  wint_t wc = re_string_wchar_at (input,
1856					  re_string_cur_idx (input) + 1);
1857	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1858	}
1859      else
1860#endif
1861	token->word_char = IS_WORD_CHAR (c2) != 0;
1862
1863      switch (c2)
1864	{
1865	case '|':
1866	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1867	    token->type = OP_ALT;
1868	  break;
1869	case '1': case '2': case '3': case '4': case '5':
1870	case '6': case '7': case '8': case '9':
1871	  if (!(syntax & RE_NO_BK_REFS))
1872	    {
1873	      token->type = OP_BACK_REF;
1874	      token->opr.idx = c2 - '1';
1875	    }
1876	  break;
1877	case '<':
1878	  if (!(syntax & RE_NO_GNU_OPS))
1879	    {
1880	      token->type = ANCHOR;
1881	      token->opr.ctx_type = WORD_FIRST;
1882	    }
1883	  break;
1884	case '>':
1885	  if (!(syntax & RE_NO_GNU_OPS))
1886	    {
1887	      token->type = ANCHOR;
1888	      token->opr.ctx_type = WORD_LAST;
1889	    }
1890	  break;
1891	case 'b':
1892	  if (!(syntax & RE_NO_GNU_OPS))
1893	    {
1894	      token->type = ANCHOR;
1895	      token->opr.ctx_type = WORD_DELIM;
1896	    }
1897	  break;
1898	case 'B':
1899	  if (!(syntax & RE_NO_GNU_OPS))
1900	    {
1901	      token->type = ANCHOR;
1902	      token->opr.ctx_type = NOT_WORD_DELIM;
1903	    }
1904	  break;
1905	case 'w':
1906	  if (!(syntax & RE_NO_GNU_OPS))
1907	    token->type = OP_WORD;
1908	  break;
1909	case 'W':
1910	  if (!(syntax & RE_NO_GNU_OPS))
1911	    token->type = OP_NOTWORD;
1912	  break;
1913	case 's':
1914	  if (!(syntax & RE_NO_GNU_OPS))
1915	    token->type = OP_SPACE;
1916	  break;
1917	case 'S':
1918	  if (!(syntax & RE_NO_GNU_OPS))
1919	    token->type = OP_NOTSPACE;
1920	  break;
1921	case '`':
1922	  if (!(syntax & RE_NO_GNU_OPS))
1923	    {
1924	      token->type = ANCHOR;
1925	      token->opr.ctx_type = BUF_FIRST;
1926	    }
1927	  break;
1928	case '\'':
1929	  if (!(syntax & RE_NO_GNU_OPS))
1930	    {
1931	      token->type = ANCHOR;
1932	      token->opr.ctx_type = BUF_LAST;
1933	    }
1934	  break;
1935	case '(':
1936	  if (!(syntax & RE_NO_BK_PARENS))
1937	    token->type = OP_OPEN_SUBEXP;
1938	  break;
1939	case ')':
1940	  if (!(syntax & RE_NO_BK_PARENS))
1941	    token->type = OP_CLOSE_SUBEXP;
1942	  break;
1943	case '+':
1944	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945	    token->type = OP_DUP_PLUS;
1946	  break;
1947	case '?':
1948	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1949	    token->type = OP_DUP_QUESTION;
1950	  break;
1951	case '{':
1952	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953	    token->type = OP_OPEN_DUP_NUM;
1954	  break;
1955	case '}':
1956	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1957	    token->type = OP_CLOSE_DUP_NUM;
1958	  break;
1959	default:
1960	  break;
1961	}
1962      return 2;
1963    }
1964
1965  token->type = CHARACTER;
1966#ifdef RE_ENABLE_I18N
1967  if (input->mb_cur_max > 1)
1968    {
1969      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1970      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1971    }
1972  else
1973#endif
1974    token->word_char = IS_WORD_CHAR (token->opr.c);
1975
1976  switch (c)
1977    {
1978    case '\n':
1979      if (syntax & RE_NEWLINE_ALT)
1980	token->type = OP_ALT;
1981      break;
1982    case '|':
1983      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1984	token->type = OP_ALT;
1985      break;
1986    case '*':
1987      token->type = OP_DUP_ASTERISK;
1988      break;
1989    case '+':
1990      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991	token->type = OP_DUP_PLUS;
1992      break;
1993    case '?':
1994      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1995	token->type = OP_DUP_QUESTION;
1996      break;
1997    case '{':
1998      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999	token->type = OP_OPEN_DUP_NUM;
2000      break;
2001    case '}':
2002      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2003	token->type = OP_CLOSE_DUP_NUM;
2004      break;
2005    case '(':
2006      if (syntax & RE_NO_BK_PARENS)
2007	token->type = OP_OPEN_SUBEXP;
2008      break;
2009    case ')':
2010      if (syntax & RE_NO_BK_PARENS)
2011	token->type = OP_CLOSE_SUBEXP;
2012      break;
2013    case '[':
2014      token->type = OP_OPEN_BRACKET;
2015      break;
2016    case '.':
2017      token->type = OP_PERIOD;
2018      break;
2019    case '^':
2020      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2021	  re_string_cur_idx (input) != 0)
2022	{
2023	  char prev = re_string_peek_byte (input, -1);
2024	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2025	    break;
2026	}
2027      token->type = ANCHOR;
2028      token->opr.ctx_type = LINE_FIRST;
2029      break;
2030    case '$':
2031      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2032	  re_string_cur_idx (input) + 1 != re_string_length (input))
2033	{
2034	  re_token_t next;
2035	  re_string_skip_bytes (input, 1);
2036	  peek_token (&next, input, syntax);
2037	  re_string_skip_bytes (input, -1);
2038	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2039	    break;
2040	}
2041      token->type = ANCHOR;
2042      token->opr.ctx_type = LINE_LAST;
2043      break;
2044    default:
2045      break;
2046    }
2047  return 1;
2048}
2049
2050/* Peek a token from INPUT, and return the length of the token.
2051   We must not use this function out of bracket expressions.  */
2052
2053static int
2054internal_function
2055peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2056{
2057  unsigned char c;
2058  if (re_string_eoi (input))
2059    {
2060      token->type = END_OF_RE;
2061      return 0;
2062    }
2063  c = re_string_peek_byte (input, 0);
2064  token->opr.c = c;
2065
2066#ifdef RE_ENABLE_I18N
2067  if (input->mb_cur_max > 1 &&
2068      !re_string_first_byte (input, re_string_cur_idx (input)))
2069    {
2070      token->type = CHARACTER;
2071      return 1;
2072    }
2073#endif /* RE_ENABLE_I18N */
2074
2075  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2076      && re_string_cur_idx (input) + 1 < re_string_length (input))
2077    {
2078      /* In this case, '\' escape a character.  */
2079      unsigned char c2;
2080      re_string_skip_bytes (input, 1);
2081      c2 = re_string_peek_byte (input, 0);
2082      token->opr.c = c2;
2083      token->type = CHARACTER;
2084      return 1;
2085    }
2086  if (c == '[') /* '[' is a special char in a bracket exps.  */
2087    {
2088      unsigned char c2;
2089      int token_len;
2090      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2091	c2 = re_string_peek_byte (input, 1);
2092      else
2093	c2 = 0;
2094      token->opr.c = c2;
2095      token_len = 2;
2096      switch (c2)
2097	{
2098	case '.':
2099	  token->type = OP_OPEN_COLL_ELEM;
2100	  break;
2101	case '=':
2102	  token->type = OP_OPEN_EQUIV_CLASS;
2103	  break;
2104	case ':':
2105	  if (syntax & RE_CHAR_CLASSES)
2106	    {
2107	      token->type = OP_OPEN_CHAR_CLASS;
2108	      break;
2109	    }
2110	  /* else fall through.  */
2111	default:
2112	  token->type = CHARACTER;
2113	  token->opr.c = c;
2114	  token_len = 1;
2115	  break;
2116	}
2117      return token_len;
2118    }
2119  switch (c)
2120    {
2121    case '-':
2122      token->type = OP_CHARSET_RANGE;
2123      break;
2124    case ']':
2125      token->type = OP_CLOSE_BRACKET;
2126      break;
2127    case '^':
2128      token->type = OP_NON_MATCH_LIST;
2129      break;
2130    default:
2131      token->type = CHARACTER;
2132    }
2133  return 1;
2134}
2135
2136/* Functions for parser.  */
2137
2138/* Entry point of the parser.
2139   Parse the regular expression REGEXP and return the structure tree.
2140   If an error occurs, ERR is set by error code, and return NULL.
2141   This function build the following tree, from regular expression <reg_exp>:
2142	   CAT
2143	   / \
2144	  /   \
2145   <reg_exp>  EOR
2146
2147   CAT means concatenation.
2148   EOR means end of regular expression.  */
2149
2150static bin_tree_t *
2151parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2152       reg_errcode_t *err)
2153{
2154  re_dfa_t *dfa = preg->buffer;
2155  bin_tree_t *tree, *eor, *root;
2156  re_token_t current_token;
2157  dfa->syntax = syntax;
2158  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2159  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2160  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2161    return NULL;
2162  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2163  if (tree != NULL)
2164    root = create_tree (dfa, tree, eor, CONCAT);
2165  else
2166    root = eor;
2167  if (BE (eor == NULL || root == NULL, 0))
2168    {
2169      *err = REG_ESPACE;
2170      return NULL;
2171    }
2172  return root;
2173}
2174
2175/* This function build the following tree, from regular expression
2176   <branch1>|<branch2>:
2177	   ALT
2178	   / \
2179	  /   \
2180   <branch1> <branch2>
2181
2182   ALT means alternative, which represents the operator '|'.  */
2183
2184static bin_tree_t *
2185parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2186	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2187{
2188  re_dfa_t *dfa = preg->buffer;
2189  bin_tree_t *tree, *branch = NULL;
2190  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2191  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192    return NULL;
2193
2194  while (token->type == OP_ALT)
2195    {
2196      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2197      if (token->type != OP_ALT && token->type != END_OF_RE
2198	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2199	{
2200	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2201	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2202	    return NULL;
2203	}
2204      else
2205	branch = NULL;
2206      tree = create_tree (dfa, tree, branch, OP_ALT);
2207      if (BE (tree == NULL, 0))
2208	{
2209	  *err = REG_ESPACE;
2210	  return NULL;
2211	}
2212    }
2213  return tree;
2214}
2215
2216/* This function build the following tree, from regular expression
2217   <exp1><exp2>:
2218	CAT
2219	/ \
2220       /   \
2221   <exp1> <exp2>
2222
2223   CAT means concatenation.  */
2224
2225static bin_tree_t *
2226parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2227	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2228{
2229  bin_tree_t *tree, *expr;
2230  re_dfa_t *dfa = preg->buffer;
2231  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2232  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2233    return NULL;
2234
2235  while (token->type != OP_ALT && token->type != END_OF_RE
2236	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2237    {
2238      expr = parse_expression (regexp, preg, token, syntax, nest, err);
2239      if (BE (*err != REG_NOERROR && expr == NULL, 0))
2240	{
2241	  if (tree != NULL)
2242	    postorder (tree, free_tree, NULL);
2243	  return NULL;
2244	}
2245      if (tree != NULL && expr != NULL)
2246	{
2247	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2248	  if (newtree == NULL)
2249	    {
2250	      postorder (expr, free_tree, NULL);
2251	      postorder (tree, free_tree, NULL);
2252	      *err = REG_ESPACE;
2253	      return NULL;
2254	    }
2255	  tree = newtree;
2256	}
2257      else if (tree == NULL)
2258	tree = expr;
2259      /* Otherwise expr == NULL, we don't need to create new tree.  */
2260    }
2261  return tree;
2262}
2263
2264/* This function build the following tree, from regular expression a*:
2265	 *
2266	 |
2267	 a
2268*/
2269
2270static bin_tree_t *
2271parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2272		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2273{
2274  re_dfa_t *dfa = preg->buffer;
2275  bin_tree_t *tree;
2276  switch (token->type)
2277    {
2278    case CHARACTER:
2279      tree = create_token_tree (dfa, NULL, NULL, token);
2280      if (BE (tree == NULL, 0))
2281	{
2282	  *err = REG_ESPACE;
2283	  return NULL;
2284	}
2285#ifdef RE_ENABLE_I18N
2286      if (dfa->mb_cur_max > 1)
2287	{
2288	  while (!re_string_eoi (regexp)
2289		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2290	    {
2291	      bin_tree_t *mbc_remain;
2292	      fetch_token (token, regexp, syntax);
2293	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2294	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2295	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2296		{
2297		  *err = REG_ESPACE;
2298		  return NULL;
2299		}
2300	    }
2301	}
2302#endif
2303      break;
2304    case OP_OPEN_SUBEXP:
2305      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2306      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307	return NULL;
2308      break;
2309    case OP_OPEN_BRACKET:
2310      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2311      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2312	return NULL;
2313      break;
2314    case OP_BACK_REF:
2315      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2316	{
2317	  *err = REG_ESUBREG;
2318	  return NULL;
2319	}
2320      dfa->used_bkref_map |= 1 << token->opr.idx;
2321      tree = create_token_tree (dfa, NULL, NULL, token);
2322      if (BE (tree == NULL, 0))
2323	{
2324	  *err = REG_ESPACE;
2325	  return NULL;
2326	}
2327      ++dfa->nbackref;
2328      dfa->has_mb_node = 1;
2329      break;
2330    case OP_OPEN_DUP_NUM:
2331      if (syntax & RE_CONTEXT_INVALID_DUP)
2332	{
2333	  *err = REG_BADRPT;
2334	  return NULL;
2335	}
2336      /* FALLTHROUGH */
2337    case OP_DUP_ASTERISK:
2338    case OP_DUP_PLUS:
2339    case OP_DUP_QUESTION:
2340      if (syntax & RE_CONTEXT_INVALID_OPS)
2341	{
2342	  *err = REG_BADRPT;
2343	  return NULL;
2344	}
2345      else if (syntax & RE_CONTEXT_INDEP_OPS)
2346	{
2347	  fetch_token (token, regexp, syntax);
2348	  return parse_expression (regexp, preg, token, syntax, nest, err);
2349	}
2350      /* else fall through  */
2351    case OP_CLOSE_SUBEXP:
2352      if ((token->type == OP_CLOSE_SUBEXP) &&
2353	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2354	{
2355	  *err = REG_ERPAREN;
2356	  return NULL;
2357	}
2358      /* else fall through  */
2359    case OP_CLOSE_DUP_NUM:
2360      /* We treat it as a normal character.  */
2361
2362      /* Then we can these characters as normal characters.  */
2363      token->type = CHARACTER;
2364      /* mb_partial and word_char bits should be initialized already
2365	 by peek_token.  */
2366      tree = create_token_tree (dfa, NULL, NULL, token);
2367      if (BE (tree == NULL, 0))
2368	{
2369	  *err = REG_ESPACE;
2370	  return NULL;
2371	}
2372      break;
2373    case ANCHOR:
2374      if ((token->opr.ctx_type
2375	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2376	  && dfa->word_ops_used == 0)
2377	init_word_char (dfa);
2378      if (token->opr.ctx_type == WORD_DELIM
2379	  || token->opr.ctx_type == NOT_WORD_DELIM)
2380	{
2381	  bin_tree_t *tree_first, *tree_last;
2382	  if (token->opr.ctx_type == WORD_DELIM)
2383	    {
2384	      token->opr.ctx_type = WORD_FIRST;
2385	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2386	      token->opr.ctx_type = WORD_LAST;
2387	    }
2388	  else
2389	    {
2390	      token->opr.ctx_type = INSIDE_WORD;
2391	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2392	      token->opr.ctx_type = INSIDE_NOTWORD;
2393	    }
2394	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2395	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2396	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2397	    {
2398	      *err = REG_ESPACE;
2399	      return NULL;
2400	    }
2401	}
2402      else
2403	{
2404	  tree = create_token_tree (dfa, NULL, NULL, token);
2405	  if (BE (tree == NULL, 0))
2406	    {
2407	      *err = REG_ESPACE;
2408	      return NULL;
2409	    }
2410	}
2411      /* We must return here, since ANCHORs can't be followed
2412	 by repetition operators.
2413	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2414	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2415      fetch_token (token, regexp, syntax);
2416      return tree;
2417    case OP_PERIOD:
2418      tree = create_token_tree (dfa, NULL, NULL, token);
2419      if (BE (tree == NULL, 0))
2420	{
2421	  *err = REG_ESPACE;
2422	  return NULL;
2423	}
2424      if (dfa->mb_cur_max > 1)
2425	dfa->has_mb_node = 1;
2426      break;
2427    case OP_WORD:
2428    case OP_NOTWORD:
2429      tree = build_charclass_op (dfa, regexp->trans,
2430				 "alnum",
2431				 "_",
2432				 token->type == OP_NOTWORD, err);
2433      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2434	return NULL;
2435      break;
2436    case OP_SPACE:
2437    case OP_NOTSPACE:
2438      tree = build_charclass_op (dfa, regexp->trans,
2439				 "space",
2440				 "",
2441				 token->type == OP_NOTSPACE, err);
2442      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2443	return NULL;
2444      break;
2445    case OP_ALT:
2446    case END_OF_RE:
2447      return NULL;
2448    case BACK_SLASH:
2449      *err = REG_EESCAPE;
2450      return NULL;
2451    default:
2452      /* Must not happen?  */
2453#ifdef DEBUG
2454      assert (0);
2455#endif
2456      return NULL;
2457    }
2458  fetch_token (token, regexp, syntax);
2459
2460  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2461	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2462    {
2463      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2464      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2465	return NULL;
2466      /* In BRE consecutive duplications are not allowed.  */
2467      if ((syntax & RE_CONTEXT_INVALID_DUP)
2468	  && (token->type == OP_DUP_ASTERISK
2469	      || token->type == OP_OPEN_DUP_NUM))
2470	{
2471	  *err = REG_BADRPT;
2472	  return NULL;
2473	}
2474    }
2475
2476  return tree;
2477}
2478
2479/* This function build the following tree, from regular expression
2480   (<reg_exp>):
2481	 SUBEXP
2482	    |
2483	<reg_exp>
2484*/
2485
2486static bin_tree_t *
2487parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2488	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2489{
2490  re_dfa_t *dfa = preg->buffer;
2491  bin_tree_t *tree;
2492  size_t cur_nsub;
2493  cur_nsub = preg->re_nsub++;
2494
2495  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2496
2497  /* The subexpression may be a null string.  */
2498  if (token->type == OP_CLOSE_SUBEXP)
2499    tree = NULL;
2500  else
2501    {
2502      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2503      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2504	{
2505	  if (tree != NULL)
2506	    postorder (tree, free_tree, NULL);
2507	  *err = REG_EPAREN;
2508	}
2509      if (BE (*err != REG_NOERROR, 0))
2510	return NULL;
2511    }
2512
2513  if (cur_nsub <= '9' - '1')
2514    dfa->completed_bkref_map |= 1 << cur_nsub;
2515
2516  tree = create_tree (dfa, tree, NULL, SUBEXP);
2517  if (BE (tree == NULL, 0))
2518    {
2519      *err = REG_ESPACE;
2520      return NULL;
2521    }
2522  tree->token.opr.idx = cur_nsub;
2523  return tree;
2524}
2525
2526/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2527
2528static bin_tree_t *
2529parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2530	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2531{
2532  bin_tree_t *tree = NULL, *old_tree = NULL;
2533  Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2534  re_token_t start_token = *token;
2535
2536  if (token->type == OP_OPEN_DUP_NUM)
2537    {
2538      end = 0;
2539      start = fetch_number (regexp, token, syntax);
2540      if (start == REG_MISSING)
2541	{
2542	  if (token->type == CHARACTER && token->opr.c == ',')
2543	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2544	  else
2545	    {
2546	      *err = REG_BADBR; /* <re>{} is invalid.  */
2547	      return NULL;
2548	    }
2549	}
2550      if (BE (start != REG_ERROR, 1))
2551	{
2552	  /* We treat "{n}" as "{n,n}".  */
2553	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2554		 : ((token->type == CHARACTER && token->opr.c == ',')
2555		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2556	}
2557      if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2558	{
2559	  /* Invalid sequence.  */
2560	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2561	    {
2562	      if (token->type == END_OF_RE)
2563		*err = REG_EBRACE;
2564	      else
2565		*err = REG_BADBR;
2566
2567	      return NULL;
2568	    }
2569
2570	  /* If the syntax bit is set, rollback.  */
2571	  re_string_set_index (regexp, start_idx);
2572	  *token = start_token;
2573	  token->type = CHARACTER;
2574	  /* mb_partial and word_char bits should be already initialized by
2575	     peek_token.  */
2576	  return elem;
2577	}
2578
2579      if (BE ((end != REG_MISSING && start > end)
2580	      || token->type != OP_CLOSE_DUP_NUM, 0))
2581	{
2582	  /* First number greater than second.  */
2583	  *err = REG_BADBR;
2584	  return NULL;
2585	}
2586
2587      if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2588	{
2589	  *err = REG_ESIZE;
2590	  return NULL;
2591	}
2592    }
2593  else
2594    {
2595      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2596      end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2597    }
2598
2599  fetch_token (token, regexp, syntax);
2600
2601  if (BE (elem == NULL, 0))
2602    return NULL;
2603  if (BE (start == 0 && end == 0, 0))
2604    {
2605      postorder (elem, free_tree, NULL);
2606      return NULL;
2607    }
2608
2609  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2610  if (BE (start > 0, 0))
2611    {
2612      tree = elem;
2613      for (i = 2; i <= start; ++i)
2614	{
2615	  elem = duplicate_tree (elem, dfa);
2616	  tree = create_tree (dfa, tree, elem, CONCAT);
2617	  if (BE (elem == NULL || tree == NULL, 0))
2618	    goto parse_dup_op_espace;
2619	}
2620
2621      if (start == end)
2622	return tree;
2623
2624      /* Duplicate ELEM before it is marked optional.  */
2625      elem = duplicate_tree (elem, dfa);
2626      old_tree = tree;
2627    }
2628  else
2629    old_tree = NULL;
2630
2631  if (elem->token.type == SUBEXP)
2632    {
2633      uintptr_t subidx = elem->token.opr.idx;
2634      postorder (elem, mark_opt_subexp, (void *) subidx);
2635    }
2636
2637  tree = create_tree (dfa, elem, NULL,
2638		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2639  if (BE (tree == NULL, 0))
2640    goto parse_dup_op_espace;
2641
2642/* From gnulib's "intprops.h":
2643   True if the arithmetic type T is signed.  */
2644#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2645
2646  /* This loop is actually executed only when end != REG_MISSING,
2647     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2648     already created the start+1-th copy.  */
2649  if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2650    for (i = start + 2; i <= end; ++i)
2651      {
2652	elem = duplicate_tree (elem, dfa);
2653	tree = create_tree (dfa, tree, elem, CONCAT);
2654	if (BE (elem == NULL || tree == NULL, 0))
2655	  goto parse_dup_op_espace;
2656
2657	tree = create_tree (dfa, tree, NULL, OP_ALT);
2658	if (BE (tree == NULL, 0))
2659	  goto parse_dup_op_espace;
2660      }
2661
2662  if (old_tree)
2663    tree = create_tree (dfa, old_tree, tree, CONCAT);
2664
2665  return tree;
2666
2667 parse_dup_op_espace:
2668  *err = REG_ESPACE;
2669  return NULL;
2670}
2671
2672/* Size of the names for collating symbol/equivalence_class/character_class.
2673   I'm not sure, but maybe enough.  */
2674#define BRACKET_NAME_BUF_SIZE 32
2675
2676#ifndef _LIBC
2677  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2678     Build the range expression which starts from START_ELEM, and ends
2679     at END_ELEM.  The result are written to MBCSET and SBCSET.
2680     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2681     mbcset->range_ends, is a pointer argument since we may
2682     update it.  */
2683
2684static reg_errcode_t
2685internal_function
2686# ifdef RE_ENABLE_I18N
2687build_range_exp (const reg_syntax_t syntax,
2688                 bitset_t sbcset,
2689                 re_charset_t *mbcset,
2690                 Idx *range_alloc,
2691                 const bracket_elem_t *start_elem,
2692                 const bracket_elem_t *end_elem)
2693# else /* not RE_ENABLE_I18N */
2694build_range_exp (const reg_syntax_t syntax,
2695                 bitset_t sbcset,
2696                 const bracket_elem_t *start_elem,
2697                 const bracket_elem_t *end_elem)
2698# endif /* not RE_ENABLE_I18N */
2699{
2700  unsigned int start_ch, end_ch;
2701  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2702  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2703	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2704	  0))
2705    return REG_ERANGE;
2706
2707  /* We can handle no multi character collating elements without libc
2708     support.  */
2709  if (BE ((start_elem->type == COLL_SYM
2710	   && strlen ((char *) start_elem->opr.name) > 1)
2711	  || (end_elem->type == COLL_SYM
2712	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2713    return REG_ECOLLATE;
2714
2715# ifdef RE_ENABLE_I18N
2716  {
2717    wchar_t wc;
2718    wint_t start_wc;
2719    wint_t end_wc;
2720
2721    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2722		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2723		   : 0));
2724    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2725	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2726		 : 0));
2727    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2728		? __btowc (start_ch) : start_elem->opr.wch);
2729    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2730	      ? __btowc (end_ch) : end_elem->opr.wch);
2731    if (start_wc == WEOF || end_wc == WEOF)
2732      return REG_ECOLLATE;
2733    else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2734      return REG_ERANGE;
2735
2736    /* Got valid collation sequence values, add them as a new entry.
2737       However, for !_LIBC we have no collation elements: if the
2738       character set is single byte, the single byte character set
2739       that we build below suffices.  parse_bracket_exp passes
2740       no MBCSET if dfa->mb_cur_max == 1.  */
2741    if (mbcset)
2742      {
2743	/* Check the space of the arrays.  */
2744	if (BE (*range_alloc == mbcset->nranges, 0))
2745	  {
2746	    /* There is not enough space, need realloc.  */
2747	    wchar_t *new_array_start, *new_array_end;
2748	    Idx new_nranges;
2749
2750	    /* +1 in case of mbcset->nranges is 0.  */
2751	    new_nranges = 2 * mbcset->nranges + 1;
2752	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2753	       are NULL if *range_alloc == 0.  */
2754	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2755					  new_nranges);
2756	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2757					new_nranges);
2758
2759	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2760	      return REG_ESPACE;
2761
2762	    mbcset->range_starts = new_array_start;
2763	    mbcset->range_ends = new_array_end;
2764	    *range_alloc = new_nranges;
2765	  }
2766
2767	mbcset->range_starts[mbcset->nranges] = start_wc;
2768	mbcset->range_ends[mbcset->nranges++] = end_wc;
2769      }
2770
2771    /* Build the table for single byte characters.  */
2772    for (wc = 0; wc < SBC_MAX; ++wc)
2773      {
2774	if (start_wc <= wc && wc <= end_wc)
2775	  bitset_set (sbcset, wc);
2776      }
2777  }
2778# else /* not RE_ENABLE_I18N */
2779  {
2780    unsigned int ch;
2781    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2782		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2783		   : 0));
2784    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2785	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2786		 : 0));
2787    if (start_ch > end_ch)
2788      return REG_ERANGE;
2789    /* Build the table for single byte characters.  */
2790    for (ch = 0; ch < SBC_MAX; ++ch)
2791      if (start_ch <= ch  && ch <= end_ch)
2792	bitset_set (sbcset, ch);
2793  }
2794# endif /* not RE_ENABLE_I18N */
2795  return REG_NOERROR;
2796}
2797#endif /* not _LIBC */
2798
2799#ifndef _LIBC
2800/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2801   Build the collating element which is represented by NAME.
2802   The result are written to MBCSET and SBCSET.
2803   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2804   pointer argument since we may update it.  */
2805
2806static reg_errcode_t
2807internal_function
2808# ifdef RE_ENABLE_I18N
2809build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2810			Idx *coll_sym_alloc, const unsigned char *name)
2811# else /* not RE_ENABLE_I18N */
2812build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2813# endif /* not RE_ENABLE_I18N */
2814{
2815  size_t name_len = strlen ((const char *) name);
2816  if (BE (name_len != 1, 0))
2817    return REG_ECOLLATE;
2818  else
2819    {
2820      bitset_set (sbcset, name[0]);
2821      return REG_NOERROR;
2822    }
2823}
2824#endif /* not _LIBC */
2825
2826/* This function parse bracket expression like "[abc]", "[a-c]",
2827   "[[.a-a.]]" etc.  */
2828
2829static bin_tree_t *
2830parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2831		   reg_syntax_t syntax, reg_errcode_t *err)
2832{
2833#ifdef _LIBC
2834  const unsigned char *collseqmb;
2835  const char *collseqwc;
2836  uint32_t nrules;
2837  int32_t table_size;
2838  const int32_t *symb_table;
2839  const unsigned char *extra;
2840
2841  /* Local function for parse_bracket_exp used in _LIBC environment.
2842     Seek the collating symbol entry corresponding to NAME.
2843     Return the index of the symbol in the SYMB_TABLE,
2844     or -1 if not found.  */
2845
2846  auto inline int32_t
2847  __attribute__ ((always_inline))
2848  seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2849    {
2850      int32_t elem;
2851
2852      for (elem = 0; elem < table_size; elem++)
2853	if (symb_table[2 * elem] != 0)
2854	  {
2855	    int32_t idx = symb_table[2 * elem + 1];
2856	    /* Skip the name of collating element name.  */
2857	    idx += 1 + extra[idx];
2858	    if (/* Compare the length of the name.  */
2859		name_len == extra[idx]
2860		/* Compare the name.  */
2861		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2862	      /* Yep, this is the entry.  */
2863	      return elem;
2864	  }
2865      return -1;
2866    }
2867
2868  /* Local function for parse_bracket_exp used in _LIBC environment.
2869     Look up the collation sequence value of BR_ELEM.
2870     Return the value if succeeded, UINT_MAX otherwise.  */
2871
2872  auto inline unsigned int
2873  __attribute__ ((always_inline))
2874  lookup_collation_sequence_value (bracket_elem_t *br_elem)
2875    {
2876      if (br_elem->type == SB_CHAR)
2877	{
2878	  /*
2879	  if (MB_CUR_MAX == 1)
2880	  */
2881	  if (nrules == 0)
2882	    return collseqmb[br_elem->opr.ch];
2883	  else
2884	    {
2885	      wint_t wc = __btowc (br_elem->opr.ch);
2886	      return __collseq_table_lookup (collseqwc, wc);
2887	    }
2888	}
2889      else if (br_elem->type == MB_CHAR)
2890	{
2891	  if (nrules != 0)
2892	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2893	}
2894      else if (br_elem->type == COLL_SYM)
2895	{
2896	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2897	  if (nrules != 0)
2898	    {
2899	      int32_t elem, idx;
2900	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2901						  sym_name_len);
2902	      if (elem != -1)
2903		{
2904		  /* We found the entry.  */
2905		  idx = symb_table[2 * elem + 1];
2906		  /* Skip the name of collating element name.  */
2907		  idx += 1 + extra[idx];
2908		  /* Skip the byte sequence of the collating element.  */
2909		  idx += 1 + extra[idx];
2910		  /* Adjust for the alignment.  */
2911		  idx = (idx + 3) & ~3;
2912		  /* Skip the multibyte collation sequence value.  */
2913		  idx += sizeof (unsigned int);
2914		  /* Skip the wide char sequence of the collating element.  */
2915		  idx += sizeof (unsigned int) *
2916		    (1 + *(unsigned int *) (extra + idx));
2917		  /* Return the collation sequence value.  */
2918		  return *(unsigned int *) (extra + idx);
2919		}
2920	      else if (sym_name_len == 1)
2921		{
2922		  /* No valid character.  Match it as a single byte
2923		     character.  */
2924		  return collseqmb[br_elem->opr.name[0]];
2925		}
2926	    }
2927	  else if (sym_name_len == 1)
2928	    return collseqmb[br_elem->opr.name[0]];
2929	}
2930      return UINT_MAX;
2931    }
2932
2933  /* Local function for parse_bracket_exp used in _LIBC environment.
2934     Build the range expression which starts from START_ELEM, and ends
2935     at END_ELEM.  The result are written to MBCSET and SBCSET.
2936     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2937     mbcset->range_ends, is a pointer argument since we may
2938     update it.  */
2939
2940  auto inline reg_errcode_t
2941  __attribute__ ((always_inline))
2942  build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2943		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2944    {
2945      unsigned int ch;
2946      uint32_t start_collseq;
2947      uint32_t end_collseq;
2948
2949      /* Equivalence Classes and Character Classes can't be a range
2950	 start/end.  */
2951      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2952	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2953	      0))
2954	return REG_ERANGE;
2955
2956      /* FIXME: Implement rational ranges here, too.  */
2957      start_collseq = lookup_collation_sequence_value (start_elem);
2958      end_collseq = lookup_collation_sequence_value (end_elem);
2959      /* Check start/end collation sequence values.  */
2960      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2961	return REG_ECOLLATE;
2962      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2963	return REG_ERANGE;
2964
2965      /* Got valid collation sequence values, add them as a new entry.
2966	 However, if we have no collation elements, and the character set
2967	 is single byte, the single byte character set that we
2968	 build below suffices. */
2969      if (nrules > 0 || dfa->mb_cur_max > 1)
2970	{
2971	  /* Check the space of the arrays.  */
2972	  if (BE (*range_alloc == mbcset->nranges, 0))
2973	    {
2974	      /* There is not enough space, need realloc.  */
2975	      uint32_t *new_array_start;
2976	      uint32_t *new_array_end;
2977	      Idx new_nranges;
2978
2979	      /* +1 in case of mbcset->nranges is 0.  */
2980	      new_nranges = 2 * mbcset->nranges + 1;
2981	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2982					    new_nranges);
2983	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2984					  new_nranges);
2985
2986	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2987		return REG_ESPACE;
2988
2989	      mbcset->range_starts = new_array_start;
2990	      mbcset->range_ends = new_array_end;
2991	      *range_alloc = new_nranges;
2992	    }
2993
2994	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2995	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2996	}
2997
2998      /* Build the table for single byte characters.  */
2999      for (ch = 0; ch < SBC_MAX; ch++)
3000	{
3001	  uint32_t ch_collseq;
3002	  /*
3003	  if (MB_CUR_MAX == 1)
3004	  */
3005	  if (nrules == 0)
3006	    ch_collseq = collseqmb[ch];
3007	  else
3008	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3009	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3010	    bitset_set (sbcset, ch);
3011	}
3012      return REG_NOERROR;
3013    }
3014
3015  /* Local function for parse_bracket_exp used in _LIBC environment.
3016     Build the collating element which is represented by NAME.
3017     The result are written to MBCSET and SBCSET.
3018     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3019     pointer argument since we may update it.  */
3020
3021  auto inline reg_errcode_t
3022  __attribute__ ((always_inline))
3023  build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3024			  Idx *coll_sym_alloc, const unsigned char *name)
3025    {
3026      int32_t elem, idx;
3027      size_t name_len = strlen ((const char *) name);
3028      if (nrules != 0)
3029	{
3030	  elem = seek_collating_symbol_entry (name, name_len);
3031	  if (elem != -1)
3032	    {
3033	      /* We found the entry.  */
3034	      idx = symb_table[2 * elem + 1];
3035	      /* Skip the name of collating element name.  */
3036	      idx += 1 + extra[idx];
3037	    }
3038	  else if (name_len == 1)
3039	    {
3040	      /* No valid character, treat it as a normal
3041		 character.  */
3042	      bitset_set (sbcset, name[0]);
3043	      return REG_NOERROR;
3044	    }
3045	  else
3046	    return REG_ECOLLATE;
3047
3048	  /* Got valid collation sequence, add it as a new entry.  */
3049	  /* Check the space of the arrays.  */
3050	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3051	    {
3052	      /* Not enough, realloc it.  */
3053	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3054	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3055	      /* Use realloc since mbcset->coll_syms is NULL
3056		 if *alloc == 0.  */
3057	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3058						   new_coll_sym_alloc);
3059	      if (BE (new_coll_syms == NULL, 0))
3060		return REG_ESPACE;
3061	      mbcset->coll_syms = new_coll_syms;
3062	      *coll_sym_alloc = new_coll_sym_alloc;
3063	    }
3064	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3065	  return REG_NOERROR;
3066	}
3067      else
3068	{
3069	  if (BE (name_len != 1, 0))
3070	    return REG_ECOLLATE;
3071	  else
3072	    {
3073	      bitset_set (sbcset, name[0]);
3074	      return REG_NOERROR;
3075	    }
3076	}
3077    }
3078#endif
3079
3080  re_token_t br_token;
3081  re_bitset_ptr_t sbcset;
3082#ifdef RE_ENABLE_I18N
3083  re_charset_t *mbcset;
3084  Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3085  Idx equiv_class_alloc = 0, char_class_alloc = 0;
3086#endif /* not RE_ENABLE_I18N */
3087  bool non_match = false;
3088  bin_tree_t *work_tree;
3089  int token_len;
3090  bool first_round = true;
3091#ifdef _LIBC
3092  collseqmb = (const unsigned char *)
3093    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3094  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3095  if (nrules)
3096    {
3097      /*
3098      if (MB_CUR_MAX > 1)
3099      */
3100      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3101      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3102      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3103						  _NL_COLLATE_SYMB_TABLEMB);
3104      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3105						   _NL_COLLATE_SYMB_EXTRAMB);
3106    }
3107#endif
3108  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3109#ifdef RE_ENABLE_I18N
3110  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3111#endif /* RE_ENABLE_I18N */
3112#ifdef RE_ENABLE_I18N
3113  if (BE (sbcset == NULL || mbcset == NULL, 0))
3114#else
3115  if (BE (sbcset == NULL, 0))
3116#endif /* RE_ENABLE_I18N */
3117    {
3118      re_free (sbcset);
3119#ifdef RE_ENABLE_I18N
3120      re_free (mbcset);
3121#endif
3122      *err = REG_ESPACE;
3123      return NULL;
3124    }
3125
3126  token_len = peek_token_bracket (token, regexp, syntax);
3127  if (BE (token->type == END_OF_RE, 0))
3128    {
3129      *err = REG_BADPAT;
3130      goto parse_bracket_exp_free_return;
3131    }
3132  if (token->type == OP_NON_MATCH_LIST)
3133    {
3134#ifdef RE_ENABLE_I18N
3135      mbcset->non_match = 1;
3136#endif /* not RE_ENABLE_I18N */
3137      non_match = true;
3138      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3139	bitset_set (sbcset, '\n');
3140      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3141      token_len = peek_token_bracket (token, regexp, syntax);
3142      if (BE (token->type == END_OF_RE, 0))
3143	{
3144	  *err = REG_BADPAT;
3145	  goto parse_bracket_exp_free_return;
3146	}
3147    }
3148
3149  /* We treat the first ']' as a normal character.  */
3150  if (token->type == OP_CLOSE_BRACKET)
3151    token->type = CHARACTER;
3152
3153  while (1)
3154    {
3155      bracket_elem_t start_elem, end_elem;
3156      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3157      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3158      reg_errcode_t ret;
3159      int token_len2 = 0;
3160      bool is_range_exp = false;
3161      re_token_t token2;
3162
3163      start_elem.opr.name = start_name_buf;
3164      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3165				   syntax, first_round);
3166      if (BE (ret != REG_NOERROR, 0))
3167	{
3168	  *err = ret;
3169	  goto parse_bracket_exp_free_return;
3170	}
3171      first_round = false;
3172
3173      /* Get information about the next token.  We need it in any case.  */
3174      token_len = peek_token_bracket (token, regexp, syntax);
3175
3176      /* Do not check for ranges if we know they are not allowed.  */
3177      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3178	{
3179	  if (BE (token->type == END_OF_RE, 0))
3180	    {
3181	      *err = REG_EBRACK;
3182	      goto parse_bracket_exp_free_return;
3183	    }
3184	  if (token->type == OP_CHARSET_RANGE)
3185	    {
3186	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3187	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3188	      if (BE (token2.type == END_OF_RE, 0))
3189		{
3190		  *err = REG_EBRACK;
3191		  goto parse_bracket_exp_free_return;
3192		}
3193	      if (token2.type == OP_CLOSE_BRACKET)
3194		{
3195		  /* We treat the last '-' as a normal character.  */
3196		  re_string_skip_bytes (regexp, -token_len);
3197		  token->type = CHARACTER;
3198		}
3199	      else
3200		is_range_exp = true;
3201	    }
3202	}
3203
3204      if (is_range_exp == true)
3205	{
3206	  end_elem.opr.name = end_name_buf;
3207	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3208				       dfa, syntax, true);
3209	  if (BE (ret != REG_NOERROR, 0))
3210	    {
3211	      *err = ret;
3212	      goto parse_bracket_exp_free_return;
3213	    }
3214
3215	  token_len = peek_token_bracket (token, regexp, syntax);
3216
3217#ifdef _LIBC
3218	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3219				  &start_elem, &end_elem);
3220#else
3221# ifdef RE_ENABLE_I18N
3222	  *err = build_range_exp (syntax, sbcset,
3223				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3224				  &range_alloc, &start_elem, &end_elem);
3225# else
3226	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3227# endif
3228#endif /* RE_ENABLE_I18N */
3229	  if (BE (*err != REG_NOERROR, 0))
3230	    goto parse_bracket_exp_free_return;
3231	}
3232      else
3233	{
3234	  switch (start_elem.type)
3235	    {
3236	    case SB_CHAR:
3237	      bitset_set (sbcset, start_elem.opr.ch);
3238	      break;
3239#ifdef RE_ENABLE_I18N
3240	    case MB_CHAR:
3241	      /* Check whether the array has enough space.  */
3242	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3243		{
3244		  wchar_t *new_mbchars;
3245		  /* Not enough, realloc it.  */
3246		  /* +1 in case of mbcset->nmbchars is 0.  */
3247		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3248		  /* Use realloc since array is NULL if *alloc == 0.  */
3249		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3250					    mbchar_alloc);
3251		  if (BE (new_mbchars == NULL, 0))
3252		    goto parse_bracket_exp_espace;
3253		  mbcset->mbchars = new_mbchars;
3254		}
3255	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3256	      break;
3257#endif /* RE_ENABLE_I18N */
3258	    case EQUIV_CLASS:
3259	      *err = build_equiv_class (sbcset,
3260#ifdef RE_ENABLE_I18N
3261					mbcset, &equiv_class_alloc,
3262#endif /* RE_ENABLE_I18N */
3263					start_elem.opr.name);
3264	      if (BE (*err != REG_NOERROR, 0))
3265		goto parse_bracket_exp_free_return;
3266	      break;
3267	    case COLL_SYM:
3268	      *err = build_collating_symbol (sbcset,
3269#ifdef RE_ENABLE_I18N
3270					     mbcset, &coll_sym_alloc,
3271#endif /* RE_ENABLE_I18N */
3272					     start_elem.opr.name);
3273	      if (BE (*err != REG_NOERROR, 0))
3274		goto parse_bracket_exp_free_return;
3275	      break;
3276	    case CHAR_CLASS:
3277	      *err = build_charclass (regexp->trans, sbcset,
3278#ifdef RE_ENABLE_I18N
3279				      mbcset, &char_class_alloc,
3280#endif /* RE_ENABLE_I18N */
3281				      (const char *) start_elem.opr.name,
3282				      syntax);
3283	      if (BE (*err != REG_NOERROR, 0))
3284	       goto parse_bracket_exp_free_return;
3285	      break;
3286	    default:
3287	      assert (0);
3288	      break;
3289	    }
3290	}
3291      if (BE (token->type == END_OF_RE, 0))
3292	{
3293	  *err = REG_EBRACK;
3294	  goto parse_bracket_exp_free_return;
3295	}
3296      if (token->type == OP_CLOSE_BRACKET)
3297	break;
3298    }
3299
3300  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3301
3302  /* If it is non-matching list.  */
3303  if (non_match)
3304    bitset_not (sbcset);
3305
3306#ifdef RE_ENABLE_I18N
3307  /* Ensure only single byte characters are set.  */
3308  if (dfa->mb_cur_max > 1)
3309    bitset_mask (sbcset, dfa->sb_char);
3310
3311  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3312      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3313						     || mbcset->non_match)))
3314    {
3315      bin_tree_t *mbc_tree;
3316      int sbc_idx;
3317      /* Build a tree for complex bracket.  */
3318      dfa->has_mb_node = 1;
3319      br_token.type = COMPLEX_BRACKET;
3320      br_token.opr.mbcset = mbcset;
3321      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3322      if (BE (mbc_tree == NULL, 0))
3323	goto parse_bracket_exp_espace;
3324      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3325	if (sbcset[sbc_idx])
3326	  break;
3327      /* If there are no bits set in sbcset, there is no point
3328	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3329      if (sbc_idx < BITSET_WORDS)
3330	{
3331	  /* Build a tree for simple bracket.  */
3332	  br_token.type = SIMPLE_BRACKET;
3333	  br_token.opr.sbcset = sbcset;
3334	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3335	  if (BE (work_tree == NULL, 0))
3336	    goto parse_bracket_exp_espace;
3337
3338	  /* Then join them by ALT node.  */
3339	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3340	  if (BE (work_tree == NULL, 0))
3341	    goto parse_bracket_exp_espace;
3342	}
3343      else
3344	{
3345	  re_free (sbcset);
3346	  work_tree = mbc_tree;
3347	}
3348    }
3349  else
3350#endif /* not RE_ENABLE_I18N */
3351    {
3352#ifdef RE_ENABLE_I18N
3353      free_charset (mbcset);
3354#endif
3355      /* Build a tree for simple bracket.  */
3356      br_token.type = SIMPLE_BRACKET;
3357      br_token.opr.sbcset = sbcset;
3358      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359      if (BE (work_tree == NULL, 0))
3360	goto parse_bracket_exp_espace;
3361    }
3362  return work_tree;
3363
3364 parse_bracket_exp_espace:
3365  *err = REG_ESPACE;
3366 parse_bracket_exp_free_return:
3367  re_free (sbcset);
3368#ifdef RE_ENABLE_I18N
3369  free_charset (mbcset);
3370#endif /* RE_ENABLE_I18N */
3371  return NULL;
3372}
3373
3374/* Parse an element in the bracket expression.  */
3375
3376static reg_errcode_t
3377parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3378		       re_token_t *token, int token_len, re_dfa_t *dfa,
3379		       reg_syntax_t syntax, bool accept_hyphen)
3380{
3381#ifdef RE_ENABLE_I18N
3382  int cur_char_size;
3383  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3384  if (cur_char_size > 1)
3385    {
3386      elem->type = MB_CHAR;
3387      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3388      re_string_skip_bytes (regexp, cur_char_size);
3389      return REG_NOERROR;
3390    }
3391#endif /* RE_ENABLE_I18N */
3392  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3393  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3394      || token->type == OP_OPEN_EQUIV_CLASS)
3395    return parse_bracket_symbol (elem, regexp, token);
3396  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3397    {
3398      /* A '-' must only appear as anything but a range indicator before
3399	 the closing bracket.  Everything else is an error.  */
3400      re_token_t token2;
3401      (void) peek_token_bracket (&token2, regexp, syntax);
3402      if (token2.type != OP_CLOSE_BRACKET)
3403	/* The actual error value is not standardized since this whole
3404	   case is undefined.  But ERANGE makes good sense.  */
3405	return REG_ERANGE;
3406    }
3407  elem->type = SB_CHAR;
3408  elem->opr.ch = token->opr.c;
3409  return REG_NOERROR;
3410}
3411
3412/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3413   such as [:<character_class>:], [.<collating_element>.], and
3414   [=<equivalent_class>=].  */
3415
3416static reg_errcode_t
3417parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3418		      re_token_t *token)
3419{
3420  unsigned char ch, delim = token->opr.c;
3421  int i = 0;
3422  if (re_string_eoi(regexp))
3423    return REG_EBRACK;
3424  for (;; ++i)
3425    {
3426      if (i >= BRACKET_NAME_BUF_SIZE)
3427	return REG_EBRACK;
3428      if (token->type == OP_OPEN_CHAR_CLASS)
3429	ch = re_string_fetch_byte_case (regexp);
3430      else
3431	ch = re_string_fetch_byte (regexp);
3432      if (re_string_eoi(regexp))
3433	return REG_EBRACK;
3434      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3435	break;
3436      elem->opr.name[i] = ch;
3437    }
3438  re_string_skip_bytes (regexp, 1);
3439  elem->opr.name[i] = '\0';
3440  switch (token->type)
3441    {
3442    case OP_OPEN_COLL_ELEM:
3443      elem->type = COLL_SYM;
3444      break;
3445    case OP_OPEN_EQUIV_CLASS:
3446      elem->type = EQUIV_CLASS;
3447      break;
3448    case OP_OPEN_CHAR_CLASS:
3449      elem->type = CHAR_CLASS;
3450      break;
3451    default:
3452      break;
3453    }
3454  return REG_NOERROR;
3455}
3456
3457  /* Helper function for parse_bracket_exp.
3458     Build the equivalence class which is represented by NAME.
3459     The result are written to MBCSET and SBCSET.
3460     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3461     is a pointer argument since we may update it.  */
3462
3463static reg_errcode_t
3464#ifdef RE_ENABLE_I18N
3465build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3466		   Idx *equiv_class_alloc, const unsigned char *name)
3467#else /* not RE_ENABLE_I18N */
3468build_equiv_class (bitset_t sbcset, const unsigned char *name)
3469#endif /* not RE_ENABLE_I18N */
3470{
3471#ifdef _LIBC
3472  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3473  if (nrules != 0)
3474    {
3475      const int32_t *table, *indirect;
3476      const unsigned char *weights, *extra, *cp;
3477      unsigned char char_buf[2];
3478      int32_t idx1, idx2;
3479      unsigned int ch;
3480      size_t len;
3481      /* This #include defines a local function!  */
3482# include <locale/weight.h>
3483      /* Calculate the index for equivalence class.  */
3484      cp = name;
3485      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3486      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3487					       _NL_COLLATE_WEIGHTMB);
3488      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3489						   _NL_COLLATE_EXTRAMB);
3490      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3491						_NL_COLLATE_INDIRECTMB);
3492      idx1 = findidx (&cp, -1);
3493      if (BE (idx1 == 0 || *cp != '\0', 0))
3494	/* This isn't a valid character.  */
3495	return REG_ECOLLATE;
3496
3497      /* Build single byte matching table for this equivalence class.  */
3498      len = weights[idx1 & 0xffffff];
3499      for (ch = 0; ch < SBC_MAX; ++ch)
3500	{
3501	  char_buf[0] = ch;
3502	  cp = char_buf;
3503	  idx2 = findidx (&cp, 1);
3504/*
3505	  idx2 = table[ch];
3506*/
3507	  if (idx2 == 0)
3508	    /* This isn't a valid character.  */
3509	    continue;
3510	  /* Compare only if the length matches and the collation rule
3511	     index is the same.  */
3512	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3513	    {
3514	      int cnt = 0;
3515
3516	      while (cnt <= len &&
3517		     weights[(idx1 & 0xffffff) + 1 + cnt]
3518		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3519		++cnt;
3520
3521	      if (cnt > len)
3522		bitset_set (sbcset, ch);
3523	    }
3524	}
3525      /* Check whether the array has enough space.  */
3526      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3527	{
3528	  /* Not enough, realloc it.  */
3529	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3530	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3531	  /* Use realloc since the array is NULL if *alloc == 0.  */
3532	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3533						   int32_t,
3534						   new_equiv_class_alloc);
3535	  if (BE (new_equiv_classes == NULL, 0))
3536	    return REG_ESPACE;
3537	  mbcset->equiv_classes = new_equiv_classes;
3538	  *equiv_class_alloc = new_equiv_class_alloc;
3539	}
3540      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3541    }
3542  else
3543#endif /* _LIBC */
3544    {
3545      if (BE (strlen ((const char *) name) != 1, 0))
3546	return REG_ECOLLATE;
3547      bitset_set (sbcset, *name);
3548    }
3549  return REG_NOERROR;
3550}
3551
3552  /* Helper function for parse_bracket_exp.
3553     Build the character class which is represented by NAME.
3554     The result are written to MBCSET and SBCSET.
3555     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3556     is a pointer argument since we may update it.  */
3557
3558static reg_errcode_t
3559#ifdef RE_ENABLE_I18N
3560build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3561		 re_charset_t *mbcset, Idx *char_class_alloc,
3562		 const char *class_name, reg_syntax_t syntax)
3563#else /* not RE_ENABLE_I18N */
3564build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3565		 const char *class_name, reg_syntax_t syntax)
3566#endif /* not RE_ENABLE_I18N */
3567{
3568  int i;
3569  const char *name = class_name;
3570
3571  /* In case of REG_ICASE "upper" and "lower" match the both of
3572     upper and lower cases.  */
3573  if ((syntax & RE_ICASE)
3574      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3575    name = "alpha";
3576
3577#ifdef RE_ENABLE_I18N
3578  /* Check the space of the arrays.  */
3579  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3580    {
3581      /* Not enough, realloc it.  */
3582      /* +1 in case of mbcset->nchar_classes is 0.  */
3583      Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3584      /* Use realloc since array is NULL if *alloc == 0.  */
3585      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3586					       new_char_class_alloc);
3587      if (BE (new_char_classes == NULL, 0))
3588	return REG_ESPACE;
3589      mbcset->char_classes = new_char_classes;
3590      *char_class_alloc = new_char_class_alloc;
3591    }
3592  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3593#endif /* RE_ENABLE_I18N */
3594
3595#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3596  do {						\
3597    if (BE (trans != NULL, 0))			\
3598      {						\
3599	for (i = 0; i < SBC_MAX; ++i)		\
3600	  if (ctype_func (i))			\
3601	    bitset_set (sbcset, trans[i]);	\
3602      }						\
3603    else					\
3604      {						\
3605	for (i = 0; i < SBC_MAX; ++i)		\
3606	  if (ctype_func (i))			\
3607	    bitset_set (sbcset, i);		\
3608      }						\
3609  } while (0)
3610
3611  if (strcmp (name, "alnum") == 0)
3612    BUILD_CHARCLASS_LOOP (isalnum);
3613  else if (strcmp (name, "cntrl") == 0)
3614    BUILD_CHARCLASS_LOOP (iscntrl);
3615  else if (strcmp (name, "lower") == 0)
3616    BUILD_CHARCLASS_LOOP (islower);
3617  else if (strcmp (name, "space") == 0)
3618    BUILD_CHARCLASS_LOOP (isspace);
3619  else if (strcmp (name, "alpha") == 0)
3620    BUILD_CHARCLASS_LOOP (isalpha);
3621  else if (strcmp (name, "digit") == 0)
3622    BUILD_CHARCLASS_LOOP (isdigit);
3623  else if (strcmp (name, "print") == 0)
3624    BUILD_CHARCLASS_LOOP (isprint);
3625  else if (strcmp (name, "upper") == 0)
3626    BUILD_CHARCLASS_LOOP (isupper);
3627  else if (strcmp (name, "blank") == 0)
3628    BUILD_CHARCLASS_LOOP (isblank);
3629  else if (strcmp (name, "graph") == 0)
3630    BUILD_CHARCLASS_LOOP (isgraph);
3631  else if (strcmp (name, "punct") == 0)
3632    BUILD_CHARCLASS_LOOP (ispunct);
3633  else if (strcmp (name, "xdigit") == 0)
3634    BUILD_CHARCLASS_LOOP (isxdigit);
3635  else
3636    return REG_ECTYPE;
3637
3638  return REG_NOERROR;
3639}
3640
3641static bin_tree_t *
3642build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3643		    const char *class_name,
3644		    const char *extra, bool non_match,
3645		    reg_errcode_t *err)
3646{
3647  re_bitset_ptr_t sbcset;
3648#ifdef RE_ENABLE_I18N
3649  re_charset_t *mbcset;
3650  Idx alloc = 0;
3651#endif /* not RE_ENABLE_I18N */
3652  reg_errcode_t ret;
3653  re_token_t br_token;
3654  bin_tree_t *tree;
3655
3656  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3657#ifdef RE_ENABLE_I18N
3658  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3659#endif /* RE_ENABLE_I18N */
3660
3661#ifdef RE_ENABLE_I18N
3662  if (BE (sbcset == NULL || mbcset == NULL, 0))
3663#else /* not RE_ENABLE_I18N */
3664  if (BE (sbcset == NULL, 0))
3665#endif /* not RE_ENABLE_I18N */
3666    {
3667      *err = REG_ESPACE;
3668      return NULL;
3669    }
3670
3671  if (non_match)
3672    {
3673#ifdef RE_ENABLE_I18N
3674      mbcset->non_match = 1;
3675#endif /* not RE_ENABLE_I18N */
3676    }
3677
3678  /* We don't care the syntax in this case.  */
3679  ret = build_charclass (trans, sbcset,
3680#ifdef RE_ENABLE_I18N
3681			 mbcset, &alloc,
3682#endif /* RE_ENABLE_I18N */
3683			 class_name, 0);
3684
3685  if (BE (ret != REG_NOERROR, 0))
3686    {
3687      re_free (sbcset);
3688#ifdef RE_ENABLE_I18N
3689      free_charset (mbcset);
3690#endif /* RE_ENABLE_I18N */
3691      *err = ret;
3692      return NULL;
3693    }
3694  /* \w match '_' also.  */
3695  for (; *extra; extra++)
3696    bitset_set (sbcset, *extra);
3697
3698  /* If it is non-matching list.  */
3699  if (non_match)
3700    bitset_not (sbcset);
3701
3702#ifdef RE_ENABLE_I18N
3703  /* Ensure only single byte characters are set.  */
3704  if (dfa->mb_cur_max > 1)
3705    bitset_mask (sbcset, dfa->sb_char);
3706#endif
3707
3708  /* Build a tree for simple bracket.  */
3709  br_token.type = SIMPLE_BRACKET;
3710  br_token.opr.sbcset = sbcset;
3711  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3712  if (BE (tree == NULL, 0))
3713    goto build_word_op_espace;
3714
3715#ifdef RE_ENABLE_I18N
3716  if (dfa->mb_cur_max > 1)
3717    {
3718      bin_tree_t *mbc_tree;
3719      /* Build a tree for complex bracket.  */
3720      br_token.type = COMPLEX_BRACKET;
3721      br_token.opr.mbcset = mbcset;
3722      dfa->has_mb_node = 1;
3723      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3724      if (BE (mbc_tree == NULL, 0))
3725	goto build_word_op_espace;
3726      /* Then join them by ALT node.  */
3727      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3728      if (BE (mbc_tree != NULL, 1))
3729	return tree;
3730    }
3731  else
3732    {
3733      free_charset (mbcset);
3734      return tree;
3735    }
3736#else /* not RE_ENABLE_I18N */
3737  return tree;
3738#endif /* not RE_ENABLE_I18N */
3739
3740 build_word_op_espace:
3741  re_free (sbcset);
3742#ifdef RE_ENABLE_I18N
3743  free_charset (mbcset);
3744#endif /* RE_ENABLE_I18N */
3745  *err = REG_ESPACE;
3746  return NULL;
3747}
3748
3749/* This is intended for the expressions like "a{1,3}".
3750   Fetch a number from 'input', and return the number.
3751   Return REG_MISSING if the number field is empty like "{,1}".
3752   Return RE_DUP_MAX + 1 if the number field is too large.
3753   Return REG_ERROR if an error occurred.  */
3754
3755static Idx
3756fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3757{
3758  Idx num = REG_MISSING;
3759  unsigned char c;
3760  while (1)
3761    {
3762      fetch_token (token, input, syntax);
3763      c = token->opr.c;
3764      if (BE (token->type == END_OF_RE, 0))
3765	return REG_ERROR;
3766      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3767	break;
3768      num = ((token->type != CHARACTER || c < '0' || '9' < c
3769	      || num == REG_ERROR)
3770	     ? REG_ERROR
3771	     : num == REG_MISSING
3772	     ? c - '0'
3773	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3774    }
3775  return num;
3776}
3777
3778#ifdef RE_ENABLE_I18N
3779static void
3780free_charset (re_charset_t *cset)
3781{
3782  re_free (cset->mbchars);
3783# ifdef _LIBC
3784  re_free (cset->coll_syms);
3785  re_free (cset->equiv_classes);
3786  re_free (cset->range_starts);
3787  re_free (cset->range_ends);
3788# endif
3789  re_free (cset->char_classes);
3790  re_free (cset);
3791}
3792#endif /* RE_ENABLE_I18N */
3793
3794/* Functions for binary tree operation.  */
3795
3796/* Create a tree node.  */
3797
3798static bin_tree_t *
3799create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3800	     re_token_type_t type)
3801{
3802  re_token_t t;
3803  t.type = type;
3804  return create_token_tree (dfa, left, right, &t);
3805}
3806
3807static bin_tree_t *
3808create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3809		   const re_token_t *token)
3810{
3811  bin_tree_t *tree;
3812  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3813    {
3814      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3815
3816      if (storage == NULL)
3817	return NULL;
3818      storage->next = dfa->str_tree_storage;
3819      dfa->str_tree_storage = storage;
3820      dfa->str_tree_storage_idx = 0;
3821    }
3822  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3823
3824  tree->parent = NULL;
3825  tree->left = left;
3826  tree->right = right;
3827  tree->token = *token;
3828  tree->token.duplicated = 0;
3829  tree->token.opt_subexp = 0;
3830  tree->first = NULL;
3831  tree->next = NULL;
3832  tree->node_idx = REG_MISSING;
3833
3834  if (left != NULL)
3835    left->parent = tree;
3836  if (right != NULL)
3837    right->parent = tree;
3838  return tree;
3839}
3840
3841/* Mark the tree SRC as an optional subexpression.
3842   To be called from preorder or postorder.  */
3843
3844static reg_errcode_t
3845mark_opt_subexp (void *extra, bin_tree_t *node)
3846{
3847  Idx idx = (uintptr_t) extra;
3848  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849    node->token.opt_subexp = 1;
3850
3851  return REG_NOERROR;
3852}
3853
3854/* Free the allocated memory inside NODE. */
3855
3856static void
3857free_token (re_token_t *node)
3858{
3859#ifdef RE_ENABLE_I18N
3860  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861    free_charset (node->opr.mbcset);
3862  else
3863#endif /* RE_ENABLE_I18N */
3864    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865      re_free (node->opr.sbcset);
3866}
3867
3868/* Worker function for tree walking.  Free the allocated memory inside NODE
3869   and its children. */
3870
3871static reg_errcode_t
3872free_tree (void *extra, bin_tree_t *node)
3873{
3874  free_token (&node->token);
3875  return REG_NOERROR;
3876}
3877
3878
3879/* Duplicate the node SRC, and return new node.  This is a preorder
3880   visit similar to the one implemented by the generic visitor, but
3881   we need more infrastructure to maintain two parallel trees --- so,
3882   it's easier to duplicate.  */
3883
3884static bin_tree_t *
3885duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3886{
3887  const bin_tree_t *node;
3888  bin_tree_t *dup_root;
3889  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3890
3891  for (node = root; ; )
3892    {
3893      /* Create a new tree and link it back to the current parent.  */
3894      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3895      if (*p_new == NULL)
3896	return NULL;
3897      (*p_new)->parent = dup_node;
3898      (*p_new)->token.duplicated = 1;
3899      dup_node = *p_new;
3900
3901      /* Go to the left node, or up and to the right.  */
3902      if (node->left)
3903	{
3904	  node = node->left;
3905	  p_new = &dup_node->left;
3906	}
3907      else
3908	{
3909	  const bin_tree_t *prev = NULL;
3910	  while (node->right == prev || node->right == NULL)
3911	    {
3912	      prev = node;
3913	      node = node->parent;
3914	      dup_node = dup_node->parent;
3915	      if (!node)
3916		return dup_root;
3917	    }
3918	  node = node->right;
3919	  p_new = &dup_node->right;
3920	}
3921    }
3922}
3923