1/* -*- buffer-read-only: t -*- vi: set ro: */
2/* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3#line 1
4/* Extended regular expression matching and search library.
5   Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
6   Software Foundation, Inc.
7   This file is part of the GNU C Library.
8   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
9
10   This program is free software; you can redistribute it and/or modify
11   it under the terms of the GNU General Public License as published by
12   the Free Software Foundation; either version 3, or (at your option)
13   any later version.
14
15   This program is distributed in the hope that it will be useful,
16   but WITHOUT ANY WARRANTY; without even the implied warranty of
17   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18   GNU General Public License for more details.
19
20   You should have received a copy of the GNU General Public License along
21   with this program; if not, write to the Free Software Foundation,
22   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
23
24#include "intprops.h"
25#include "verify.h"
26static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
27					  size_t length, reg_syntax_t syntax);
28static void re_compile_fastmap_iter (regex_t *bufp,
29				     const re_dfastate_t *init_state,
30				     char *fastmap);
31static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
32#ifdef RE_ENABLE_I18N
33static void free_charset (re_charset_t *cset);
34#endif /* RE_ENABLE_I18N */
35static void free_workarea_compile (regex_t *preg);
36static reg_errcode_t create_initial_state (re_dfa_t *dfa);
37#ifdef RE_ENABLE_I18N
38static void optimize_utf8 (re_dfa_t *dfa);
39#endif
40static reg_errcode_t analyze (regex_t *preg);
41static reg_errcode_t preorder (bin_tree_t *root,
42			       reg_errcode_t (fn (void *, bin_tree_t *)),
43			       void *extra);
44static reg_errcode_t postorder (bin_tree_t *root,
45				reg_errcode_t (fn (void *, bin_tree_t *)),
46				void *extra);
47static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
48static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
49static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
50				 bin_tree_t *node);
51static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
52static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
53static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
54static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
55static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
56				   unsigned int constraint);
57static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
58static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
59					 Idx node, bool root);
60static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
61static Idx fetch_number (re_string_t *input, re_token_t *token,
62			 reg_syntax_t syntax);
63static int peek_token (re_token_t *token, re_string_t *input,
64			reg_syntax_t syntax) internal_function;
65static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
66			  reg_syntax_t syntax, reg_errcode_t *err);
67static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
68				  re_token_t *token, reg_syntax_t syntax,
69				  Idx nest, reg_errcode_t *err);
70static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
71				 re_token_t *token, reg_syntax_t syntax,
72				 Idx nest, reg_errcode_t *err);
73static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
74				     re_token_t *token, reg_syntax_t syntax,
75				     Idx nest, reg_errcode_t *err);
76static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
77				  re_token_t *token, reg_syntax_t syntax,
78				  Idx nest, reg_errcode_t *err);
79static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
80				 re_dfa_t *dfa, re_token_t *token,
81				 reg_syntax_t syntax, reg_errcode_t *err);
82static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
83				      re_token_t *token, reg_syntax_t syntax,
84				      reg_errcode_t *err);
85static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
86					    re_string_t *regexp,
87					    re_token_t *token, int token_len,
88					    re_dfa_t *dfa,
89					    reg_syntax_t syntax,
90					    bool accept_hyphen);
91static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
92					  re_string_t *regexp,
93					  re_token_t *token);
94#ifdef RE_ENABLE_I18N
95static reg_errcode_t build_equiv_class (bitset_t sbcset,
96					re_charset_t *mbcset,
97					Idx *equiv_class_alloc,
98					const unsigned char *name);
99static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
100				      bitset_t sbcset,
101				      re_charset_t *mbcset,
102				      Idx *char_class_alloc,
103				      const unsigned char *class_name,
104				      reg_syntax_t syntax);
105#else  /* not RE_ENABLE_I18N */
106static reg_errcode_t build_equiv_class (bitset_t sbcset,
107					const unsigned char *name);
108static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
109				      bitset_t sbcset,
110				      const unsigned char *class_name,
111				      reg_syntax_t syntax);
112#endif /* not RE_ENABLE_I18N */
113static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
114				       RE_TRANSLATE_TYPE trans,
115				       const unsigned char *class_name,
116				       const unsigned char *extra,
117				       bool non_match, reg_errcode_t *err);
118static bin_tree_t *create_tree (re_dfa_t *dfa,
119				bin_tree_t *left, bin_tree_t *right,
120				re_token_type_t type);
121static bin_tree_t *create_token_tree (re_dfa_t *dfa,
122				      bin_tree_t *left, bin_tree_t *right,
123				      const re_token_t *token);
124static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
125static void free_token (re_token_t *node);
126static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
127static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
128
129/* This table gives an error message for each of the error codes listed
130   in regex.h.  Obviously the order here has to be same as there.
131   POSIX doesn't require that we do anything for REG_NOERROR,
132   but why not be nice?  */
133
134static const char __re_error_msgid[] =
135  {
136#define REG_NOERROR_IDX	0
137    gettext_noop ("Success")	/* REG_NOERROR */
138    "\0"
139#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
140    gettext_noop ("No match")	/* REG_NOMATCH */
141    "\0"
142#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
143    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
144    "\0"
145#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
146    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
147    "\0"
148#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
149    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
150    "\0"
151#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
152    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
153    "\0"
154#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
155    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
156    "\0"
157#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
158    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
159    "\0"
160#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
161    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
162    "\0"
163#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
164    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
165    "\0"
166#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
167    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
168    "\0"
169#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
170    gettext_noop ("Invalid range end")	/* REG_ERANGE */
171    "\0"
172#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
173    gettext_noop ("Memory exhausted") /* REG_ESPACE */
174    "\0"
175#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
176    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
177    "\0"
178#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
179    gettext_noop ("Premature end of regular expression") /* REG_EEND */
180    "\0"
181#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
182    gettext_noop ("Regular expression too big") /* REG_ESIZE */
183    "\0"
184#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
185    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186  };
187
188static const size_t __re_error_msgid_idx[] =
189  {
190    REG_NOERROR_IDX,
191    REG_NOMATCH_IDX,
192    REG_BADPAT_IDX,
193    REG_ECOLLATE_IDX,
194    REG_ECTYPE_IDX,
195    REG_EESCAPE_IDX,
196    REG_ESUBREG_IDX,
197    REG_EBRACK_IDX,
198    REG_EPAREN_IDX,
199    REG_EBRACE_IDX,
200    REG_BADBR_IDX,
201    REG_ERANGE_IDX,
202    REG_ESPACE_IDX,
203    REG_BADRPT_IDX,
204    REG_EEND_IDX,
205    REG_ESIZE_IDX,
206    REG_ERPAREN_IDX
207  };
208
209/* Entry points for GNU code.  */
210
211/* re_compile_pattern is the GNU regular expression compiler: it
212   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213   Returns 0 if the pattern was valid, otherwise an error string.
214
215   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
216   are set in BUFP on entry.  */
217
218#ifdef _LIBC
219const char *
220re_compile_pattern (pattern, length, bufp)
221    const char *pattern;
222    size_t length;
223    struct re_pattern_buffer *bufp;
224#else /* size_t might promote */
225const char *
226re_compile_pattern (const char *pattern, size_t length,
227		    struct re_pattern_buffer *bufp)
228#endif
229{
230  reg_errcode_t ret;
231
232  /* And GNU code determines whether or not to get register information
233     by passing null for the REGS argument to re_match, etc., not by
234     setting no_sub, unless RE_NO_SUB is set.  */
235  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
236
237  /* Match anchors at newline.  */
238  bufp->newline_anchor = 1;
239
240  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
241
242  if (!ret)
243    return NULL;
244  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
245}
246#ifdef _LIBC
247weak_alias (__re_compile_pattern, re_compile_pattern)
248#endif
249
250/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
251   also be assigned to arbitrarily: each pattern buffer stores its own
252   syntax, so it can be changed between regex compilations.  */
253/* This has no initializer because initialized variables in Emacs
254   become read-only after dumping.  */
255reg_syntax_t re_syntax_options;
256
257
258/* Specify the precise syntax of regexps for compilation.  This provides
259   for compatibility for various utilities which historically have
260   different, incompatible syntaxes.
261
262   The argument SYNTAX is a bit mask comprised of the various bits
263   defined in regex.h.  We return the old syntax.  */
264
265reg_syntax_t
266re_set_syntax (syntax)
267    reg_syntax_t syntax;
268{
269  reg_syntax_t ret = re_syntax_options;
270
271  re_syntax_options = syntax;
272  return ret;
273}
274#ifdef _LIBC
275weak_alias (__re_set_syntax, re_set_syntax)
276#endif
277
278int
279re_compile_fastmap (bufp)
280    struct re_pattern_buffer *bufp;
281{
282  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
283  char *fastmap = bufp->fastmap;
284
285  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
286  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
287  if (dfa->init_state != dfa->init_state_word)
288    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
289  if (dfa->init_state != dfa->init_state_nl)
290    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
291  if (dfa->init_state != dfa->init_state_begbuf)
292    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
293  bufp->fastmap_accurate = 1;
294  return 0;
295}
296#ifdef _LIBC
297weak_alias (__re_compile_fastmap, re_compile_fastmap)
298#endif
299
300static inline void
301__attribute ((always_inline))
302re_set_fastmap (char *fastmap, bool icase, int ch)
303{
304  fastmap[ch] = 1;
305  if (icase)
306    fastmap[tolower (ch)] = 1;
307}
308
309/* Helper function for re_compile_fastmap.
310   Compile fastmap for the initial_state INIT_STATE.  */
311
312static void
313re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
314			 char *fastmap)
315{
316  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
317  Idx node_cnt;
318  bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
319  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
320    {
321      Idx node = init_state->nodes.elems[node_cnt];
322      re_token_type_t type = dfa->nodes[node].type;
323
324      if (type == CHARACTER)
325	{
326	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
327#ifdef RE_ENABLE_I18N
328	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
329	    {
330	      unsigned char buf[MB_LEN_MAX];
331	      unsigned char *p;
332	      wchar_t wc;
333	      mbstate_t state;
334
335	      p = buf;
336	      *p++ = dfa->nodes[node].opr.c;
337	      while (++node < dfa->nodes_len
338		     &&	dfa->nodes[node].type == CHARACTER
339		     && dfa->nodes[node].mb_partial)
340		*p++ = dfa->nodes[node].opr.c;
341	      memset (&state, '\0', sizeof (state));
342	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
343			     &state) == p - buf
344		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
345		      != (size_t) -1))
346		re_set_fastmap (fastmap, false, buf[0]);
347	    }
348#endif
349	}
350      else if (type == SIMPLE_BRACKET)
351	{
352	  int i, ch;
353	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
354	    {
355	      int j;
356	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
357	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
358		if (w & ((bitset_word_t) 1 << j))
359		  re_set_fastmap (fastmap, icase, ch);
360	    }
361	}
362#ifdef RE_ENABLE_I18N
363      else if (type == COMPLEX_BRACKET)
364	{
365	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
366	  Idx i;
367
368# ifdef _LIBC
369	  /* See if we have to try all bytes which start multiple collation
370	     elements.
371	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
372		  collation element, and don't catch 'b' since 'b' is
373		  the only collation element which starts from 'b' (and
374		  it is caught by SIMPLE_BRACKET).  */
375	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
376		  && (cset->ncoll_syms || cset->nranges))
377		{
378		  const int32_t *table = (const int32_t *)
379		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
380		  for (i = 0; i < SBC_MAX; ++i)
381		    if (table[i] < 0)
382		      re_set_fastmap (fastmap, icase, i);
383		}
384# endif /* _LIBC */
385
386	  /* See if we have to start the match at all multibyte characters,
387	     i.e. where we would not find an invalid sequence.  This only
388	     applies to multibyte character sets; for single byte character
389	     sets, the SIMPLE_BRACKET again suffices.  */
390	  if (dfa->mb_cur_max > 1
391	      && (cset->nchar_classes || cset->non_match || cset->nranges
392# ifdef _LIBC
393		  || cset->nequiv_classes
394# endif /* _LIBC */
395		 ))
396	    {
397	      unsigned char c = 0;
398	      do
399		{
400		  mbstate_t mbs;
401		  memset (&mbs, 0, sizeof (mbs));
402		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
403		    re_set_fastmap (fastmap, false, (int) c);
404		}
405	      while (++c != 0);
406	    }
407
408	  else
409	    {
410	      /* ... Else catch all bytes which can start the mbchars.  */
411	      for (i = 0; i < cset->nmbchars; ++i)
412		{
413		  char buf[256];
414		  mbstate_t state;
415		  memset (&state, '\0', sizeof (state));
416		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
417		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
418		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
419		    {
420		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
421			  != (size_t) -1)
422			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
423		    }
424		}
425	    }
426	}
427#endif /* RE_ENABLE_I18N */
428      else if (type == OP_PERIOD
429#ifdef RE_ENABLE_I18N
430	       || type == OP_UTF8_PERIOD
431#endif /* RE_ENABLE_I18N */
432	       || type == END_OF_RE)
433	{
434	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
435	  if (type == END_OF_RE)
436	    bufp->can_be_null = 1;
437	  return;
438	}
439    }
440}
441
442/* Entry point for POSIX code.  */
443/* regcomp takes a regular expression as a string and compiles it.
444
445   PREG is a regex_t *.  We do not expect any fields to be initialized,
446   since POSIX says we shouldn't.  Thus, we set
447
448     `buffer' to the compiled pattern;
449     `used' to the length of the compiled pattern;
450     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
451       REG_EXTENDED bit in CFLAGS is set; otherwise, to
452       RE_SYNTAX_POSIX_BASIC;
453     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
454     `fastmap' to an allocated space for the fastmap;
455     `fastmap_accurate' to zero;
456     `re_nsub' to the number of subexpressions in PATTERN.
457
458   PATTERN is the address of the pattern string.
459
460   CFLAGS is a series of bits which affect compilation.
461
462     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
463     use POSIX basic syntax.
464
465     If REG_NEWLINE is set, then . and [^...] don't match newline.
466     Also, regexec will try a match beginning after every newline.
467
468     If REG_ICASE is set, then we considers upper- and lowercase
469     versions of letters to be equivalent when matching.
470
471     If REG_NOSUB is set, then when PREG is passed to regexec, that
472     routine will report only success or failure, and nothing about the
473     registers.
474
475   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
476   the return codes and their meanings.)  */
477
478int
479regcomp (preg, pattern, cflags)
480    regex_t *_Restrict_ preg;
481    const char *_Restrict_ pattern;
482    int cflags;
483{
484  reg_errcode_t ret;
485  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
486			 : RE_SYNTAX_POSIX_BASIC);
487
488  preg->buffer = NULL;
489  preg->allocated = 0;
490  preg->used = 0;
491
492  /* Try to allocate space for the fastmap.  */
493  preg->fastmap = re_malloc (char, SBC_MAX);
494  if (BE (preg->fastmap == NULL, 0))
495    return REG_ESPACE;
496
497  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
498
499  /* If REG_NEWLINE is set, newlines are treated differently.  */
500  if (cflags & REG_NEWLINE)
501    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
502      syntax &= ~RE_DOT_NEWLINE;
503      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
504      /* It also changes the matching behavior.  */
505      preg->newline_anchor = 1;
506    }
507  else
508    preg->newline_anchor = 0;
509  preg->no_sub = !!(cflags & REG_NOSUB);
510  preg->translate = NULL;
511
512  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
513
514  /* POSIX doesn't distinguish between an unmatched open-group and an
515     unmatched close-group: both are REG_EPAREN.  */
516  if (ret == REG_ERPAREN)
517    ret = REG_EPAREN;
518
519  /* We have already checked preg->fastmap != NULL.  */
520  if (BE (ret == REG_NOERROR, 1))
521    /* Compute the fastmap now, since regexec cannot modify the pattern
522       buffer.  This function never fails in this implementation.  */
523    (void) re_compile_fastmap (preg);
524  else
525    {
526      /* Some error occurred while compiling the expression.  */
527      re_free (preg->fastmap);
528      preg->fastmap = NULL;
529    }
530
531  return (int) ret;
532}
533#ifdef _LIBC
534weak_alias (__regcomp, regcomp)
535#endif
536
537/* Returns a message corresponding to an error code, ERRCODE, returned
538   from either regcomp or regexec.   We don't use PREG here.  */
539
540#ifdef _LIBC
541size_t
542regerror (errcode, preg, errbuf, errbuf_size)
543    int errcode;
544    const regex_t *_Restrict_ preg;
545    char *_Restrict_ errbuf;
546    size_t errbuf_size;
547#else /* size_t might promote */
548size_t
549regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_,
550	  char *_Restrict_ errbuf, size_t errbuf_size)
551#endif
552{
553  const char *msg;
554  size_t msg_size;
555
556  if (BE (errcode < 0
557	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
558			       / sizeof (__re_error_msgid_idx[0])), 0))
559    /* Only error codes returned by the rest of the code should be passed
560       to this routine.  If we are given anything else, or if other regex
561       code generates an invalid error code, then the program has a bug.
562       Dump core so we can fix it.  */
563    abort ();
564
565  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
566
567  msg_size = strlen (msg) + 1; /* Includes the null.  */
568
569  if (BE (errbuf_size != 0, 1))
570    {
571      size_t cpy_size = msg_size;
572      if (BE (msg_size > errbuf_size, 0))
573	{
574	  cpy_size = errbuf_size - 1;
575	  errbuf[cpy_size] = '\0';
576	}
577      memcpy (errbuf, msg, cpy_size);
578    }
579
580  return msg_size;
581}
582#ifdef _LIBC
583weak_alias (__regerror, regerror)
584#endif
585
586
587#ifdef RE_ENABLE_I18N
588/* This static array is used for the map to single-byte characters when
589   UTF-8 is used.  Otherwise we would allocate memory just to initialize
590   it the same all the time.  UTF-8 is the preferred encoding so this is
591   a worthwhile optimization.  */
592static const bitset_t utf8_sb_map =
593{
594  /* Set the first 128 bits.  */
595# if 4 * BITSET_WORD_BITS < ASCII_CHARS
596#  error "bitset_word_t is narrower than 32 bits"
597# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
598  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
599# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
600  BITSET_WORD_MAX, BITSET_WORD_MAX,
601# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
602  BITSET_WORD_MAX,
603# endif
604  (BITSET_WORD_MAX
605   >> (SBC_MAX % BITSET_WORD_BITS == 0
606       ? 0
607       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
608};
609#endif
610
611
612static void
613free_dfa_content (re_dfa_t *dfa)
614{
615  Idx i, j;
616
617  if (dfa->nodes)
618    for (i = 0; i < dfa->nodes_len; ++i)
619      free_token (dfa->nodes + i);
620  re_free (dfa->nexts);
621  for (i = 0; i < dfa->nodes_len; ++i)
622    {
623      if (dfa->eclosures != NULL)
624	re_node_set_free (dfa->eclosures + i);
625      if (dfa->inveclosures != NULL)
626	re_node_set_free (dfa->inveclosures + i);
627      if (dfa->edests != NULL)
628	re_node_set_free (dfa->edests + i);
629    }
630  re_free (dfa->edests);
631  re_free (dfa->eclosures);
632  re_free (dfa->inveclosures);
633  re_free (dfa->nodes);
634
635  if (dfa->state_table)
636    for (i = 0; i <= dfa->state_hash_mask; ++i)
637      {
638	struct re_state_table_entry *entry = dfa->state_table + i;
639	for (j = 0; j < entry->num; ++j)
640	  {
641	    re_dfastate_t *state = entry->array[j];
642	    free_state (state);
643	  }
644        re_free (entry->array);
645      }
646  re_free (dfa->state_table);
647#ifdef RE_ENABLE_I18N
648  if (dfa->sb_char != utf8_sb_map)
649    re_free (dfa->sb_char);
650#endif
651  re_free (dfa->subexp_map);
652#ifdef DEBUG
653  re_free (dfa->re_str);
654#endif
655
656  re_free (dfa);
657}
658
659
660/* Free dynamically allocated space used by PREG.  */
661
662void
663regfree (preg)
664    regex_t *preg;
665{
666  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
667  if (BE (dfa != NULL, 1))
668    free_dfa_content (dfa);
669  preg->buffer = NULL;
670  preg->allocated = 0;
671
672  re_free (preg->fastmap);
673  preg->fastmap = NULL;
674
675  re_free (preg->translate);
676  preg->translate = NULL;
677}
678#ifdef _LIBC
679weak_alias (__regfree, regfree)
680#endif
681
682/* Entry points compatible with 4.2 BSD regex library.  We don't define
683   them unless specifically requested.  */
684
685#if defined _REGEX_RE_COMP || defined _LIBC
686
687/* BSD has one and only one pattern buffer.  */
688static struct re_pattern_buffer re_comp_buf;
689
690char *
691# ifdef _LIBC
692/* Make these definitions weak in libc, so POSIX programs can redefine
693   these names if they don't use our functions, and still use
694   regcomp/regexec above without link errors.  */
695weak_function
696# endif
697re_comp (s)
698     const char *s;
699{
700  reg_errcode_t ret;
701  char *fastmap;
702
703  if (!s)
704    {
705      if (!re_comp_buf.buffer)
706	return gettext ("No previous regular expression");
707      return 0;
708    }
709
710  if (re_comp_buf.buffer)
711    {
712      fastmap = re_comp_buf.fastmap;
713      re_comp_buf.fastmap = NULL;
714      __regfree (&re_comp_buf);
715      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
716      re_comp_buf.fastmap = fastmap;
717    }
718
719  if (re_comp_buf.fastmap == NULL)
720    {
721      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
722      if (re_comp_buf.fastmap == NULL)
723	return (char *) gettext (__re_error_msgid
724				 + __re_error_msgid_idx[(int) REG_ESPACE]);
725    }
726
727  /* Since `re_exec' always passes NULL for the `regs' argument, we
728     don't need to initialize the pattern buffer fields which affect it.  */
729
730  /* Match anchors at newlines.  */
731  re_comp_buf.newline_anchor = 1;
732
733  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
734
735  if (!ret)
736    return NULL;
737
738  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
739  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
740}
741
742#ifdef _LIBC
743libc_freeres_fn (free_mem)
744{
745  __regfree (&re_comp_buf);
746}
747#endif
748
749#endif /* _REGEX_RE_COMP */
750
751/* Internal entry point.
752   Compile the regular expression PATTERN, whose length is LENGTH.
753   SYNTAX indicate regular expression's syntax.  */
754
755static reg_errcode_t
756re_compile_internal (regex_t *preg, const char * pattern, size_t length,
757		     reg_syntax_t syntax)
758{
759  reg_errcode_t err = REG_NOERROR;
760  re_dfa_t *dfa;
761  re_string_t regexp;
762
763  /* Initialize the pattern buffer.  */
764  preg->fastmap_accurate = 0;
765  preg->syntax = syntax;
766  preg->not_bol = preg->not_eol = 0;
767  preg->used = 0;
768  preg->re_nsub = 0;
769  preg->can_be_null = 0;
770  preg->regs_allocated = REGS_UNALLOCATED;
771
772  /* Initialize the dfa.  */
773  dfa = (re_dfa_t *) preg->buffer;
774  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
775    {
776      /* If zero allocated, but buffer is non-null, try to realloc
777	 enough space.  This loses if buffer's address is bogus, but
778	 that is the user's responsibility.  If ->buffer is NULL this
779	 is a simple allocation.  */
780      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
781      if (dfa == NULL)
782	return REG_ESPACE;
783      preg->allocated = sizeof (re_dfa_t);
784      preg->buffer = (unsigned char *) dfa;
785    }
786  preg->used = sizeof (re_dfa_t);
787
788  err = init_dfa (dfa, length);
789  if (BE (err != REG_NOERROR, 0))
790    {
791      free_dfa_content (dfa);
792      preg->buffer = NULL;
793      preg->allocated = 0;
794      return err;
795    }
796#ifdef DEBUG
797  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
798  dfa->re_str = re_malloc (char, length + 1);
799  strncpy (dfa->re_str, pattern, length + 1);
800#endif
801
802  __libc_lock_init (dfa->lock);
803
804  err = re_string_construct (&regexp, pattern, length, preg->translate,
805			     (syntax & RE_ICASE) != 0, dfa);
806  if (BE (err != REG_NOERROR, 0))
807    {
808    re_compile_internal_free_return:
809      free_workarea_compile (preg);
810      re_string_destruct (&regexp);
811      free_dfa_content (dfa);
812      preg->buffer = NULL;
813      preg->allocated = 0;
814      return err;
815    }
816
817  /* Parse the regular expression, and build a structure tree.  */
818  preg->re_nsub = 0;
819  dfa->str_tree = parse (&regexp, preg, syntax, &err);
820  if (BE (dfa->str_tree == NULL, 0))
821    goto re_compile_internal_free_return;
822
823  /* Analyze the tree and create the nfa.  */
824  err = analyze (preg);
825  if (BE (err != REG_NOERROR, 0))
826    goto re_compile_internal_free_return;
827
828#ifdef RE_ENABLE_I18N
829  /* If possible, do searching in single byte encoding to speed things up.  */
830  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
831    optimize_utf8 (dfa);
832#endif
833
834  /* Then create the initial state of the dfa.  */
835  err = create_initial_state (dfa);
836
837  /* Release work areas.  */
838  free_workarea_compile (preg);
839  re_string_destruct (&regexp);
840
841  if (BE (err != REG_NOERROR, 0))
842    {
843      free_dfa_content (dfa);
844      preg->buffer = NULL;
845      preg->allocated = 0;
846    }
847
848  return err;
849}
850
851/* Initialize DFA.  We use the length of the regular expression PAT_LEN
852   as the initial length of some arrays.  */
853
854static reg_errcode_t
855init_dfa (re_dfa_t *dfa, size_t pat_len)
856{
857  __re_size_t table_size;
858#ifndef _LIBC
859  char *codeset_name;
860#endif
861#ifdef RE_ENABLE_I18N
862  size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
863#else
864  size_t max_i18n_object_size = 0;
865#endif
866  size_t max_object_size =
867    MAX (sizeof (struct re_state_table_entry),
868	 MAX (sizeof (re_token_t),
869	      MAX (sizeof (re_node_set),
870		   MAX (sizeof (regmatch_t),
871			max_i18n_object_size))));
872
873  memset (dfa, '\0', sizeof (re_dfa_t));
874
875  /* Force allocation of str_tree_storage the first time.  */
876  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
877
878  /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
879     calculation below, and for similar doubling calculations
880     elsewhere.  And it's <= rather than <, because some of the
881     doubling calculations add 1 afterwards.  */
882  if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
883    return REG_ESPACE;
884
885  dfa->nodes_alloc = pat_len + 1;
886  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
887
888  /*  table_size = 2 ^ ceil(log pat_len) */
889  for (table_size = 1; ; table_size <<= 1)
890    if (table_size > pat_len)
891      break;
892
893  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
894  dfa->state_hash_mask = table_size - 1;
895
896  dfa->mb_cur_max = MB_CUR_MAX;
897#ifdef _LIBC
898  if (dfa->mb_cur_max == 6
899      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
900    dfa->is_utf8 = 1;
901  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
902		       != 0);
903#else
904  codeset_name = nl_langinfo (CODESET);
905  if (strcasecmp (codeset_name, "UTF-8") == 0
906      || strcasecmp (codeset_name, "UTF8") == 0)
907    dfa->is_utf8 = 1;
908
909  /* We check exhaustively in the loop below if this charset is a
910     superset of ASCII.  */
911  dfa->map_notascii = 0;
912#endif
913
914#ifdef RE_ENABLE_I18N
915  if (dfa->mb_cur_max > 1)
916    {
917      if (dfa->is_utf8)
918	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
919      else
920	{
921	  int i, j, ch;
922
923	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
924	  if (BE (dfa->sb_char == NULL, 0))
925	    return REG_ESPACE;
926
927	  /* Set the bits corresponding to single byte chars.  */
928	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
929	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
930	      {
931		wint_t wch = __btowc (ch);
932		if (wch != WEOF)
933		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
934# ifndef _LIBC
935		if (isascii (ch) && wch != ch)
936		  dfa->map_notascii = 1;
937# endif
938	      }
939	}
940    }
941#endif
942
943  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
944    return REG_ESPACE;
945  return REG_NOERROR;
946}
947
948/* Initialize WORD_CHAR table, which indicate which character is
949   "word".  In this case "word" means that it is the word construction
950   character used by some operators like "\<", "\>", etc.  */
951
952static void
953internal_function
954init_word_char (re_dfa_t *dfa)
955{
956  int i, j, ch;
957  dfa->word_ops_used = 1;
958  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
959    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
960      if (isalnum (ch) || ch == '_')
961	dfa->word_char[i] |= (bitset_word_t) 1 << j;
962}
963
964/* Free the work area which are only used while compiling.  */
965
966static void
967free_workarea_compile (regex_t *preg)
968{
969  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
970  bin_tree_storage_t *storage, *next;
971  for (storage = dfa->str_tree_storage; storage; storage = next)
972    {
973      next = storage->next;
974      re_free (storage);
975    }
976  dfa->str_tree_storage = NULL;
977  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
978  dfa->str_tree = NULL;
979  re_free (dfa->org_indices);
980  dfa->org_indices = NULL;
981}
982
983/* Create initial states for all contexts.  */
984
985static reg_errcode_t
986create_initial_state (re_dfa_t *dfa)
987{
988  Idx first, i;
989  reg_errcode_t err;
990  re_node_set init_nodes;
991
992  /* Initial states have the epsilon closure of the node which is
993     the first node of the regular expression.  */
994  first = dfa->str_tree->first->node_idx;
995  dfa->init_node = first;
996  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
997  if (BE (err != REG_NOERROR, 0))
998    return err;
999
1000  /* The back-references which are in initial states can epsilon transit,
1001     since in this case all of the subexpressions can be null.
1002     Then we add epsilon closures of the nodes which are the next nodes of
1003     the back-references.  */
1004  if (dfa->nbackref > 0)
1005    for (i = 0; i < init_nodes.nelem; ++i)
1006      {
1007	Idx node_idx = init_nodes.elems[i];
1008	re_token_type_t type = dfa->nodes[node_idx].type;
1009
1010	Idx clexp_idx;
1011	if (type != OP_BACK_REF)
1012	  continue;
1013	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1014	  {
1015	    re_token_t *clexp_node;
1016	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1017	    if (clexp_node->type == OP_CLOSE_SUBEXP
1018		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1019	      break;
1020	  }
1021	if (clexp_idx == init_nodes.nelem)
1022	  continue;
1023
1024	if (type == OP_BACK_REF)
1025	  {
1026	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1027	    if (!re_node_set_contains (&init_nodes, dest_idx))
1028	      {
1029		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1030		i = 0;
1031	      }
1032	  }
1033      }
1034
1035  /* It must be the first time to invoke acquire_state.  */
1036  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1037  /* We don't check ERR here, since the initial state must not be NULL.  */
1038  if (BE (dfa->init_state == NULL, 0))
1039    return err;
1040  if (dfa->init_state->has_constraint)
1041    {
1042      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1043						       CONTEXT_WORD);
1044      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1045						     CONTEXT_NEWLINE);
1046      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1047							 &init_nodes,
1048							 CONTEXT_NEWLINE
1049							 | CONTEXT_BEGBUF);
1050      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1051	      || dfa->init_state_begbuf == NULL, 0))
1052	return err;
1053    }
1054  else
1055    dfa->init_state_word = dfa->init_state_nl
1056      = dfa->init_state_begbuf = dfa->init_state;
1057
1058  re_node_set_free (&init_nodes);
1059  return REG_NOERROR;
1060}
1061
1062#ifdef RE_ENABLE_I18N
1063/* If it is possible to do searching in single byte encoding instead of UTF-8
1064   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1065   DFA nodes where needed.  */
1066
1067static void
1068optimize_utf8 (re_dfa_t *dfa)
1069{
1070  Idx node;
1071  int i;
1072  bool mb_chars = false;
1073  bool has_period = false;
1074
1075  for (node = 0; node < dfa->nodes_len; ++node)
1076    switch (dfa->nodes[node].type)
1077      {
1078      case CHARACTER:
1079	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1080	  mb_chars = true;
1081	break;
1082      case ANCHOR:
1083	switch (dfa->nodes[node].opr.ctx_type)
1084	  {
1085	  case LINE_FIRST:
1086	  case LINE_LAST:
1087	  case BUF_FIRST:
1088	  case BUF_LAST:
1089	    break;
1090	  default:
1091	    /* Word anchors etc. cannot be handled.  It's okay to test
1092	       opr.ctx_type since constraints (for all DFA nodes) are
1093	       created by ORing one or more opr.ctx_type values.  */
1094	    return;
1095	  }
1096	break;
1097      case OP_PERIOD:
1098        has_period = true;
1099        break;
1100      case OP_BACK_REF:
1101      case OP_ALT:
1102      case END_OF_RE:
1103      case OP_DUP_ASTERISK:
1104      case OP_OPEN_SUBEXP:
1105      case OP_CLOSE_SUBEXP:
1106	break;
1107      case COMPLEX_BRACKET:
1108	return;
1109      case SIMPLE_BRACKET:
1110	/* Just double check.  */
1111	{
1112	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1113			? 0
1114			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1115	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1116	    {
1117	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1118		return;
1119	      rshift = 0;
1120	    }
1121	}
1122	break;
1123      default:
1124	abort ();
1125      }
1126
1127  if (mb_chars || has_period)
1128    for (node = 0; node < dfa->nodes_len; ++node)
1129      {
1130	if (dfa->nodes[node].type == CHARACTER
1131	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1132	  dfa->nodes[node].mb_partial = 0;
1133	else if (dfa->nodes[node].type == OP_PERIOD)
1134	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1135      }
1136
1137  /* The search can be in single byte locale.  */
1138  dfa->mb_cur_max = 1;
1139  dfa->is_utf8 = 0;
1140  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1141}
1142#endif
1143
1144/* Analyze the structure tree, and calculate "first", "next", "edest",
1145   "eclosure", and "inveclosure".  */
1146
1147static reg_errcode_t
1148analyze (regex_t *preg)
1149{
1150  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1151  reg_errcode_t ret;
1152
1153  /* Allocate arrays.  */
1154  dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1155  dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1156  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1157  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1158  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1159	  || dfa->eclosures == NULL, 0))
1160    return REG_ESPACE;
1161
1162  dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1163  if (dfa->subexp_map != NULL)
1164    {
1165      Idx i;
1166      for (i = 0; i < preg->re_nsub; i++)
1167	dfa->subexp_map[i] = i;
1168      preorder (dfa->str_tree, optimize_subexps, dfa);
1169      for (i = 0; i < preg->re_nsub; i++)
1170	if (dfa->subexp_map[i] != i)
1171	  break;
1172      if (i == preg->re_nsub)
1173	{
1174	  free (dfa->subexp_map);
1175	  dfa->subexp_map = NULL;
1176	}
1177    }
1178
1179  ret = postorder (dfa->str_tree, lower_subexps, preg);
1180  if (BE (ret != REG_NOERROR, 0))
1181    return ret;
1182  ret = postorder (dfa->str_tree, calc_first, dfa);
1183  if (BE (ret != REG_NOERROR, 0))
1184    return ret;
1185  preorder (dfa->str_tree, calc_next, dfa);
1186  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1187  if (BE (ret != REG_NOERROR, 0))
1188    return ret;
1189  ret = calc_eclosure (dfa);
1190  if (BE (ret != REG_NOERROR, 0))
1191    return ret;
1192
1193  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1194     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1195  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1196      || dfa->nbackref)
1197    {
1198      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1199      if (BE (dfa->inveclosures == NULL, 0))
1200        return REG_ESPACE;
1201      ret = calc_inveclosure (dfa);
1202    }
1203
1204  return ret;
1205}
1206
1207/* Our parse trees are very unbalanced, so we cannot use a stack to
1208   implement parse tree visits.  Instead, we use parent pointers and
1209   some hairy code in these two functions.  */
1210static reg_errcode_t
1211postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1212	   void *extra)
1213{
1214  bin_tree_t *node, *prev;
1215
1216  for (node = root; ; )
1217    {
1218      /* Descend down the tree, preferably to the left (or to the right
1219	 if that's the only child).  */
1220      while (node->left || node->right)
1221	if (node->left)
1222          node = node->left;
1223        else
1224          node = node->right;
1225
1226      do
1227	{
1228	  reg_errcode_t err = fn (extra, node);
1229	  if (BE (err != REG_NOERROR, 0))
1230	    return err;
1231          if (node->parent == NULL)
1232	    return REG_NOERROR;
1233	  prev = node;
1234	  node = node->parent;
1235	}
1236      /* Go up while we have a node that is reached from the right.  */
1237      while (node->right == prev || node->right == NULL);
1238      node = node->right;
1239    }
1240}
1241
1242static reg_errcode_t
1243preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1244	  void *extra)
1245{
1246  bin_tree_t *node;
1247
1248  for (node = root; ; )
1249    {
1250      reg_errcode_t err = fn (extra, node);
1251      if (BE (err != REG_NOERROR, 0))
1252	return err;
1253
1254      /* Go to the left node, or up and to the right.  */
1255      if (node->left)
1256	node = node->left;
1257      else
1258	{
1259	  bin_tree_t *prev = NULL;
1260	  while (node->right == prev || node->right == NULL)
1261	    {
1262	      prev = node;
1263	      node = node->parent;
1264	      if (!node)
1265	        return REG_NOERROR;
1266	    }
1267	  node = node->right;
1268	}
1269    }
1270}
1271
1272/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1273   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1274   backreferences as well.  Requires a preorder visit.  */
1275static reg_errcode_t
1276optimize_subexps (void *extra, bin_tree_t *node)
1277{
1278  re_dfa_t *dfa = (re_dfa_t *) extra;
1279
1280  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1281    {
1282      int idx = node->token.opr.idx;
1283      node->token.opr.idx = dfa->subexp_map[idx];
1284      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285    }
1286
1287  else if (node->token.type == SUBEXP
1288           && node->left && node->left->token.type == SUBEXP)
1289    {
1290      Idx other_idx = node->left->token.opr.idx;
1291
1292      node->left = node->left->left;
1293      if (node->left)
1294        node->left->parent = node;
1295
1296      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1297      if (other_idx < BITSET_WORD_BITS)
1298	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1299    }
1300
1301  return REG_NOERROR;
1302}
1303
1304/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1305   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1306static reg_errcode_t
1307lower_subexps (void *extra, bin_tree_t *node)
1308{
1309  regex_t *preg = (regex_t *) extra;
1310  reg_errcode_t err = REG_NOERROR;
1311
1312  if (node->left && node->left->token.type == SUBEXP)
1313    {
1314      node->left = lower_subexp (&err, preg, node->left);
1315      if (node->left)
1316	node->left->parent = node;
1317    }
1318  if (node->right && node->right->token.type == SUBEXP)
1319    {
1320      node->right = lower_subexp (&err, preg, node->right);
1321      if (node->right)
1322	node->right->parent = node;
1323    }
1324
1325  return err;
1326}
1327
1328static bin_tree_t *
1329lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1330{
1331  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1332  bin_tree_t *body = node->left;
1333  bin_tree_t *op, *cls, *tree1, *tree;
1334
1335  if (preg->no_sub
1336      /* We do not optimize empty subexpressions, because otherwise we may
1337	 have bad CONCAT nodes with NULL children.  This is obviously not
1338	 very common, so we do not lose much.  An example that triggers
1339	 this case is the sed "script" /\(\)/x.  */
1340      && node->left != NULL
1341      && (node->token.opr.idx >= BITSET_WORD_BITS
1342	  || !(dfa->used_bkref_map
1343	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1344    return node->left;
1345
1346  /* Convert the SUBEXP node to the concatenation of an
1347     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1348  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1349  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1350  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1351  tree = create_tree (dfa, op, tree1, CONCAT);
1352  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1353    {
1354      *err = REG_ESPACE;
1355      return NULL;
1356    }
1357
1358  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1359  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1360  return tree;
1361}
1362
1363/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1364   nodes.  Requires a postorder visit.  */
1365static reg_errcode_t
1366calc_first (void *extra, bin_tree_t *node)
1367{
1368  re_dfa_t *dfa = (re_dfa_t *) extra;
1369  if (node->token.type == CONCAT)
1370    {
1371      node->first = node->left->first;
1372      node->node_idx = node->left->node_idx;
1373    }
1374  else
1375    {
1376      node->first = node;
1377      node->node_idx = re_dfa_add_node (dfa, node->token);
1378      if (BE (node->node_idx == REG_MISSING, 0))
1379        return REG_ESPACE;
1380      if (node->token.type == ANCHOR)
1381        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1382    }
1383  return REG_NOERROR;
1384}
1385
1386/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1387static reg_errcode_t
1388calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
1389{
1390  switch (node->token.type)
1391    {
1392    case OP_DUP_ASTERISK:
1393      node->left->next = node;
1394      break;
1395    case CONCAT:
1396      node->left->next = node->right->first;
1397      node->right->next = node->next;
1398      break;
1399    default:
1400      if (node->left)
1401	node->left->next = node->next;
1402      if (node->right)
1403        node->right->next = node->next;
1404      break;
1405    }
1406  return REG_NOERROR;
1407}
1408
1409/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1410static reg_errcode_t
1411link_nfa_nodes (void *extra, bin_tree_t *node)
1412{
1413  re_dfa_t *dfa = (re_dfa_t *) extra;
1414  Idx idx = node->node_idx;
1415  reg_errcode_t err = REG_NOERROR;
1416
1417  switch (node->token.type)
1418    {
1419    case CONCAT:
1420      break;
1421
1422    case END_OF_RE:
1423      assert (node->next == NULL);
1424      break;
1425
1426    case OP_DUP_ASTERISK:
1427    case OP_ALT:
1428      {
1429	Idx left, right;
1430	dfa->has_plural_match = 1;
1431	if (node->left != NULL)
1432	  left = node->left->first->node_idx;
1433	else
1434	  left = node->next->node_idx;
1435	if (node->right != NULL)
1436	  right = node->right->first->node_idx;
1437	else
1438	  right = node->next->node_idx;
1439	assert (REG_VALID_INDEX (left));
1440	assert (REG_VALID_INDEX (right));
1441	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1442      }
1443      break;
1444
1445    case ANCHOR:
1446    case OP_OPEN_SUBEXP:
1447    case OP_CLOSE_SUBEXP:
1448      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1449      break;
1450
1451    case OP_BACK_REF:
1452      dfa->nexts[idx] = node->next->node_idx;
1453      if (node->token.type == OP_BACK_REF)
1454	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1455      break;
1456
1457    default:
1458      assert (!IS_EPSILON_NODE (node->token.type));
1459      dfa->nexts[idx] = node->next->node_idx;
1460      break;
1461    }
1462
1463  return err;
1464}
1465
1466/* Duplicate the epsilon closure of the node ROOT_NODE.
1467   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1468   to their own constraint.  */
1469
1470static reg_errcode_t
1471internal_function
1472duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1473			Idx root_node, unsigned int init_constraint)
1474{
1475  Idx org_node, clone_node;
1476  bool ok;
1477  unsigned int constraint = init_constraint;
1478  for (org_node = top_org_node, clone_node = top_clone_node;;)
1479    {
1480      Idx org_dest, clone_dest;
1481      if (dfa->nodes[org_node].type == OP_BACK_REF)
1482	{
1483	  /* If the back reference epsilon-transit, its destination must
1484	     also have the constraint.  Then duplicate the epsilon closure
1485	     of the destination of the back reference, and store it in
1486	     edests of the back reference.  */
1487	  org_dest = dfa->nexts[org_node];
1488	  re_node_set_empty (dfa->edests + clone_node);
1489	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1490	  if (BE (clone_dest == REG_MISSING, 0))
1491	    return REG_ESPACE;
1492	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1493	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1494	  if (BE (! ok, 0))
1495	    return REG_ESPACE;
1496	}
1497      else if (dfa->edests[org_node].nelem == 0)
1498	{
1499	  /* In case of the node can't epsilon-transit, don't duplicate the
1500	     destination and store the original destination as the
1501	     destination of the node.  */
1502	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1503	  break;
1504	}
1505      else if (dfa->edests[org_node].nelem == 1)
1506	{
1507	  /* In case of the node can epsilon-transit, and it has only one
1508	     destination.  */
1509	  org_dest = dfa->edests[org_node].elems[0];
1510	  re_node_set_empty (dfa->edests + clone_node);
1511	  /* If the node is root_node itself, it means the epsilon closure
1512	     has a loop.  Then tie it to the destination of the root_node.  */
1513	  if (org_node == root_node && clone_node != org_node)
1514	    {
1515	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1516	      if (BE (! ok, 0))
1517	        return REG_ESPACE;
1518	      break;
1519	    }
1520	  /* In case the node has another constraint, append it.  */
1521	  constraint |= dfa->nodes[org_node].constraint;
1522	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1523	  if (BE (clone_dest == REG_MISSING, 0))
1524	    return REG_ESPACE;
1525	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526	  if (BE (! ok, 0))
1527	    return REG_ESPACE;
1528	}
1529      else /* dfa->edests[org_node].nelem == 2 */
1530	{
1531	  /* In case of the node can epsilon-transit, and it has two
1532	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1533	  org_dest = dfa->edests[org_node].elems[0];
1534	  re_node_set_empty (dfa->edests + clone_node);
1535	  /* Search for a duplicated node which satisfies the constraint.  */
1536	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1537	  if (clone_dest == REG_MISSING)
1538	    {
1539	      /* There is no such duplicated node, create a new one.  */
1540	      reg_errcode_t err;
1541	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1542	      if (BE (clone_dest == REG_MISSING, 0))
1543		return REG_ESPACE;
1544	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1545	      if (BE (! ok, 0))
1546		return REG_ESPACE;
1547	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1548					    root_node, constraint);
1549	      if (BE (err != REG_NOERROR, 0))
1550		return err;
1551	    }
1552	  else
1553	    {
1554	      /* There is a duplicated node which satisfies the constraint,
1555		 use it to avoid infinite loop.  */
1556	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557	      if (BE (! ok, 0))
1558		return REG_ESPACE;
1559	    }
1560
1561	  org_dest = dfa->edests[org_node].elems[1];
1562	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1563	  if (BE (clone_dest == REG_MISSING, 0))
1564	    return REG_ESPACE;
1565	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566	  if (BE (! ok, 0))
1567	    return REG_ESPACE;
1568	}
1569      org_node = org_dest;
1570      clone_node = clone_dest;
1571    }
1572  return REG_NOERROR;
1573}
1574
1575/* Search for a node which is duplicated from the node ORG_NODE, and
1576   satisfies the constraint CONSTRAINT.  */
1577
1578static Idx
1579search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1580			unsigned int constraint)
1581{
1582  Idx idx;
1583  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1584    {
1585      if (org_node == dfa->org_indices[idx]
1586	  && constraint == dfa->nodes[idx].constraint)
1587	return idx; /* Found.  */
1588    }
1589  return REG_MISSING; /* Not found.  */
1590}
1591
1592/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1593   Return the index of the new node, or REG_MISSING if insufficient storage is
1594   available.  */
1595
1596static Idx
1597duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1598{
1599  Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1600  if (BE (dup_idx != REG_MISSING, 1))
1601    {
1602      dfa->nodes[dup_idx].constraint = constraint;
1603      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1604      dfa->nodes[dup_idx].duplicated = 1;
1605
1606      /* Store the index of the original node.  */
1607      dfa->org_indices[dup_idx] = org_idx;
1608    }
1609  return dup_idx;
1610}
1611
1612static reg_errcode_t
1613calc_inveclosure (re_dfa_t *dfa)
1614{
1615  Idx src, idx;
1616  bool ok;
1617  for (idx = 0; idx < dfa->nodes_len; ++idx)
1618    re_node_set_init_empty (dfa->inveclosures + idx);
1619
1620  for (src = 0; src < dfa->nodes_len; ++src)
1621    {
1622      Idx *elems = dfa->eclosures[src].elems;
1623      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1624	{
1625	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1626	  if (BE (! ok, 0))
1627	    return REG_ESPACE;
1628	}
1629    }
1630
1631  return REG_NOERROR;
1632}
1633
1634/* Calculate "eclosure" for all the node in DFA.  */
1635
1636static reg_errcode_t
1637calc_eclosure (re_dfa_t *dfa)
1638{
1639  Idx node_idx;
1640  bool incomplete;
1641#ifdef DEBUG
1642  assert (dfa->nodes_len > 0);
1643#endif
1644  incomplete = false;
1645  /* For each nodes, calculate epsilon closure.  */
1646  for (node_idx = 0; ; ++node_idx)
1647    {
1648      reg_errcode_t err;
1649      re_node_set eclosure_elem;
1650      if (node_idx == dfa->nodes_len)
1651	{
1652	  if (!incomplete)
1653	    break;
1654	  incomplete = false;
1655	  node_idx = 0;
1656	}
1657
1658#ifdef DEBUG
1659      assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1660#endif
1661
1662      /* If we have already calculated, skip it.  */
1663      if (dfa->eclosures[node_idx].nelem != 0)
1664	continue;
1665      /* Calculate epsilon closure of `node_idx'.  */
1666      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1667      if (BE (err != REG_NOERROR, 0))
1668	return err;
1669
1670      if (dfa->eclosures[node_idx].nelem == 0)
1671	{
1672	  incomplete = true;
1673	  re_node_set_free (&eclosure_elem);
1674	}
1675    }
1676  return REG_NOERROR;
1677}
1678
1679/* Calculate epsilon closure of NODE.  */
1680
1681static reg_errcode_t
1682calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1683{
1684  reg_errcode_t err;
1685  Idx i;
1686  bool incomplete;
1687  bool ok;
1688  re_node_set eclosure;
1689  incomplete = false;
1690  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1691  if (BE (err != REG_NOERROR, 0))
1692    return err;
1693
1694  /* This indicates that we are calculating this node now.
1695     We reference this value to avoid infinite loop.  */
1696  dfa->eclosures[node].nelem = REG_MISSING;
1697
1698  /* If the current node has constraints, duplicate all nodes
1699     since they must inherit the constraints.  */
1700  if (dfa->nodes[node].constraint
1701      && dfa->edests[node].nelem
1702      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1703    {
1704      err = duplicate_node_closure (dfa, node, node, node,
1705				    dfa->nodes[node].constraint);
1706      if (BE (err != REG_NOERROR, 0))
1707	return err;
1708    }
1709
1710  /* Expand each epsilon destination nodes.  */
1711  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1712    for (i = 0; i < dfa->edests[node].nelem; ++i)
1713      {
1714	re_node_set eclosure_elem;
1715	Idx edest = dfa->edests[node].elems[i];
1716	/* If calculating the epsilon closure of `edest' is in progress,
1717	   return intermediate result.  */
1718	if (dfa->eclosures[edest].nelem == REG_MISSING)
1719	  {
1720	    incomplete = true;
1721	    continue;
1722	  }
1723	/* If we haven't calculated the epsilon closure of `edest' yet,
1724	   calculate now. Otherwise use calculated epsilon closure.  */
1725	if (dfa->eclosures[edest].nelem == 0)
1726	  {
1727	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1728	    if (BE (err != REG_NOERROR, 0))
1729	      return err;
1730	  }
1731	else
1732	  eclosure_elem = dfa->eclosures[edest];
1733	/* Merge the epsilon closure of `edest'.  */
1734	re_node_set_merge (&eclosure, &eclosure_elem);
1735	/* If the epsilon closure of `edest' is incomplete,
1736	   the epsilon closure of this node is also incomplete.  */
1737	if (dfa->eclosures[edest].nelem == 0)
1738	  {
1739	    incomplete = true;
1740	    re_node_set_free (&eclosure_elem);
1741	  }
1742      }
1743
1744  /* Epsilon closures include itself.  */
1745  ok = re_node_set_insert (&eclosure, node);
1746  if (BE (! ok, 0))
1747    return REG_ESPACE;
1748  if (incomplete && !root)
1749    dfa->eclosures[node].nelem = 0;
1750  else
1751    dfa->eclosures[node] = eclosure;
1752  *new_set = eclosure;
1753  return REG_NOERROR;
1754}
1755
1756/* Functions for token which are used in the parser.  */
1757
1758/* Fetch a token from INPUT.
1759   We must not use this function inside bracket expressions.  */
1760
1761static void
1762internal_function
1763fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1764{
1765  re_string_skip_bytes (input, peek_token (result, input, syntax));
1766}
1767
1768/* Peek a token from INPUT, and return the length of the token.
1769   We must not use this function inside bracket expressions.  */
1770
1771static int
1772internal_function
1773peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1774{
1775  unsigned char c;
1776
1777  if (re_string_eoi (input))
1778    {
1779      token->type = END_OF_RE;
1780      return 0;
1781    }
1782
1783  c = re_string_peek_byte (input, 0);
1784  token->opr.c = c;
1785
1786  token->word_char = 0;
1787#ifdef RE_ENABLE_I18N
1788  token->mb_partial = 0;
1789  if (input->mb_cur_max > 1 &&
1790      !re_string_first_byte (input, re_string_cur_idx (input)))
1791    {
1792      token->type = CHARACTER;
1793      token->mb_partial = 1;
1794      return 1;
1795    }
1796#endif
1797  if (c == '\\')
1798    {
1799      unsigned char c2;
1800      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1801	{
1802	  token->type = BACK_SLASH;
1803	  return 1;
1804	}
1805
1806      c2 = re_string_peek_byte_case (input, 1);
1807      token->opr.c = c2;
1808      token->type = CHARACTER;
1809#ifdef RE_ENABLE_I18N
1810      if (input->mb_cur_max > 1)
1811	{
1812	  wint_t wc = re_string_wchar_at (input,
1813					  re_string_cur_idx (input) + 1);
1814	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1815	}
1816      else
1817#endif
1818	token->word_char = IS_WORD_CHAR (c2) != 0;
1819
1820      switch (c2)
1821	{
1822	case '|':
1823	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1824	    token->type = OP_ALT;
1825	  break;
1826	case '1': case '2': case '3': case '4': case '5':
1827	case '6': case '7': case '8': case '9':
1828	  if (!(syntax & RE_NO_BK_REFS))
1829	    {
1830	      token->type = OP_BACK_REF;
1831	      token->opr.idx = c2 - '1';
1832	    }
1833	  break;
1834	case '<':
1835	  if (!(syntax & RE_NO_GNU_OPS))
1836	    {
1837	      token->type = ANCHOR;
1838	      token->opr.ctx_type = WORD_FIRST;
1839	    }
1840	  break;
1841	case '>':
1842	  if (!(syntax & RE_NO_GNU_OPS))
1843	    {
1844	      token->type = ANCHOR;
1845	      token->opr.ctx_type = WORD_LAST;
1846	    }
1847	  break;
1848	case 'b':
1849	  if (!(syntax & RE_NO_GNU_OPS))
1850	    {
1851	      token->type = ANCHOR;
1852	      token->opr.ctx_type = WORD_DELIM;
1853	    }
1854	  break;
1855	case 'B':
1856	  if (!(syntax & RE_NO_GNU_OPS))
1857	    {
1858	      token->type = ANCHOR;
1859	      token->opr.ctx_type = NOT_WORD_DELIM;
1860	    }
1861	  break;
1862	case 'w':
1863	  if (!(syntax & RE_NO_GNU_OPS))
1864	    token->type = OP_WORD;
1865	  break;
1866	case 'W':
1867	  if (!(syntax & RE_NO_GNU_OPS))
1868	    token->type = OP_NOTWORD;
1869	  break;
1870	case 's':
1871	  if (!(syntax & RE_NO_GNU_OPS))
1872	    token->type = OP_SPACE;
1873	  break;
1874	case 'S':
1875	  if (!(syntax & RE_NO_GNU_OPS))
1876	    token->type = OP_NOTSPACE;
1877	  break;
1878	case '`':
1879	  if (!(syntax & RE_NO_GNU_OPS))
1880	    {
1881	      token->type = ANCHOR;
1882	      token->opr.ctx_type = BUF_FIRST;
1883	    }
1884	  break;
1885	case '\'':
1886	  if (!(syntax & RE_NO_GNU_OPS))
1887	    {
1888	      token->type = ANCHOR;
1889	      token->opr.ctx_type = BUF_LAST;
1890	    }
1891	  break;
1892	case '(':
1893	  if (!(syntax & RE_NO_BK_PARENS))
1894	    token->type = OP_OPEN_SUBEXP;
1895	  break;
1896	case ')':
1897	  if (!(syntax & RE_NO_BK_PARENS))
1898	    token->type = OP_CLOSE_SUBEXP;
1899	  break;
1900	case '+':
1901	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902	    token->type = OP_DUP_PLUS;
1903	  break;
1904	case '?':
1905	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1906	    token->type = OP_DUP_QUESTION;
1907	  break;
1908	case '{':
1909	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910	    token->type = OP_OPEN_DUP_NUM;
1911	  break;
1912	case '}':
1913	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1914	    token->type = OP_CLOSE_DUP_NUM;
1915	  break;
1916	default:
1917	  break;
1918	}
1919      return 2;
1920    }
1921
1922  token->type = CHARACTER;
1923#ifdef RE_ENABLE_I18N
1924  if (input->mb_cur_max > 1)
1925    {
1926      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1927      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1928    }
1929  else
1930#endif
1931    token->word_char = IS_WORD_CHAR (token->opr.c);
1932
1933  switch (c)
1934    {
1935    case '\n':
1936      if (syntax & RE_NEWLINE_ALT)
1937	token->type = OP_ALT;
1938      break;
1939    case '|':
1940      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1941	token->type = OP_ALT;
1942      break;
1943    case '*':
1944      token->type = OP_DUP_ASTERISK;
1945      break;
1946    case '+':
1947      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948	token->type = OP_DUP_PLUS;
1949      break;
1950    case '?':
1951      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1952	token->type = OP_DUP_QUESTION;
1953      break;
1954    case '{':
1955      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956	token->type = OP_OPEN_DUP_NUM;
1957      break;
1958    case '}':
1959      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1960	token->type = OP_CLOSE_DUP_NUM;
1961      break;
1962    case '(':
1963      if (syntax & RE_NO_BK_PARENS)
1964	token->type = OP_OPEN_SUBEXP;
1965      break;
1966    case ')':
1967      if (syntax & RE_NO_BK_PARENS)
1968	token->type = OP_CLOSE_SUBEXP;
1969      break;
1970    case '[':
1971      token->type = OP_OPEN_BRACKET;
1972      break;
1973    case '.':
1974      token->type = OP_PERIOD;
1975      break;
1976    case '^':
1977      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1978	  re_string_cur_idx (input) != 0)
1979	{
1980	  char prev = re_string_peek_byte (input, -1);
1981	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1982	    break;
1983	}
1984      token->type = ANCHOR;
1985      token->opr.ctx_type = LINE_FIRST;
1986      break;
1987    case '$':
1988      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1989	  re_string_cur_idx (input) + 1 != re_string_length (input))
1990	{
1991	  re_token_t next;
1992	  re_string_skip_bytes (input, 1);
1993	  peek_token (&next, input, syntax);
1994	  re_string_skip_bytes (input, -1);
1995	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1996	    break;
1997	}
1998      token->type = ANCHOR;
1999      token->opr.ctx_type = LINE_LAST;
2000      break;
2001    default:
2002      break;
2003    }
2004  return 1;
2005}
2006
2007/* Peek a token from INPUT, and return the length of the token.
2008   We must not use this function out of bracket expressions.  */
2009
2010static int
2011internal_function
2012peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2013{
2014  unsigned char c;
2015  if (re_string_eoi (input))
2016    {
2017      token->type = END_OF_RE;
2018      return 0;
2019    }
2020  c = re_string_peek_byte (input, 0);
2021  token->opr.c = c;
2022
2023#ifdef RE_ENABLE_I18N
2024  if (input->mb_cur_max > 1 &&
2025      !re_string_first_byte (input, re_string_cur_idx (input)))
2026    {
2027      token->type = CHARACTER;
2028      return 1;
2029    }
2030#endif /* RE_ENABLE_I18N */
2031
2032  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2033      && re_string_cur_idx (input) + 1 < re_string_length (input))
2034    {
2035      /* In this case, '\' escape a character.  */
2036      unsigned char c2;
2037      re_string_skip_bytes (input, 1);
2038      c2 = re_string_peek_byte (input, 0);
2039      token->opr.c = c2;
2040      token->type = CHARACTER;
2041      return 1;
2042    }
2043  if (c == '[') /* '[' is a special char in a bracket exps.  */
2044    {
2045      unsigned char c2;
2046      int token_len;
2047      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2048	c2 = re_string_peek_byte (input, 1);
2049      else
2050	c2 = 0;
2051      token->opr.c = c2;
2052      token_len = 2;
2053      switch (c2)
2054	{
2055	case '.':
2056	  token->type = OP_OPEN_COLL_ELEM;
2057	  break;
2058	case '=':
2059	  token->type = OP_OPEN_EQUIV_CLASS;
2060	  break;
2061	case ':':
2062	  if (syntax & RE_CHAR_CLASSES)
2063	    {
2064	      token->type = OP_OPEN_CHAR_CLASS;
2065	      break;
2066	    }
2067	  /* else fall through.  */
2068	default:
2069	  token->type = CHARACTER;
2070	  token->opr.c = c;
2071	  token_len = 1;
2072	  break;
2073	}
2074      return token_len;
2075    }
2076  switch (c)
2077    {
2078    case '-':
2079      token->type = OP_CHARSET_RANGE;
2080      break;
2081    case ']':
2082      token->type = OP_CLOSE_BRACKET;
2083      break;
2084    case '^':
2085      token->type = OP_NON_MATCH_LIST;
2086      break;
2087    default:
2088      token->type = CHARACTER;
2089    }
2090  return 1;
2091}
2092
2093/* Functions for parser.  */
2094
2095/* Entry point of the parser.
2096   Parse the regular expression REGEXP and return the structure tree.
2097   If an error is occured, ERR is set by error code, and return NULL.
2098   This function build the following tree, from regular expression <reg_exp>:
2099	   CAT
2100	   / \
2101	  /   \
2102   <reg_exp>  EOR
2103
2104   CAT means concatenation.
2105   EOR means end of regular expression.  */
2106
2107static bin_tree_t *
2108parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2109       reg_errcode_t *err)
2110{
2111  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2112  bin_tree_t *tree, *eor, *root;
2113  re_token_t current_token;
2114  dfa->syntax = syntax;
2115  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2116  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2117  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2118    return NULL;
2119  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2120  if (tree != NULL)
2121    root = create_tree (dfa, tree, eor, CONCAT);
2122  else
2123    root = eor;
2124  if (BE (eor == NULL || root == NULL, 0))
2125    {
2126      *err = REG_ESPACE;
2127      return NULL;
2128    }
2129  return root;
2130}
2131
2132/* This function build the following tree, from regular expression
2133   <branch1>|<branch2>:
2134	   ALT
2135	   / \
2136	  /   \
2137   <branch1> <branch2>
2138
2139   ALT means alternative, which represents the operator `|'.  */
2140
2141static bin_tree_t *
2142parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2143	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2144{
2145  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2146  bin_tree_t *tree, *branch = NULL;
2147  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2148  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149    return NULL;
2150
2151  while (token->type == OP_ALT)
2152    {
2153      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154      if (token->type != OP_ALT && token->type != END_OF_RE
2155	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2156	{
2157	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2158	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2159	    return NULL;
2160	}
2161      else
2162	branch = NULL;
2163      tree = create_tree (dfa, tree, branch, OP_ALT);
2164      if (BE (tree == NULL, 0))
2165	{
2166	  *err = REG_ESPACE;
2167	  return NULL;
2168	}
2169    }
2170  return tree;
2171}
2172
2173/* This function build the following tree, from regular expression
2174   <exp1><exp2>:
2175	CAT
2176	/ \
2177       /   \
2178   <exp1> <exp2>
2179
2180   CAT means concatenation.  */
2181
2182static bin_tree_t *
2183parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2184	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2185{
2186  bin_tree_t *tree, *expr;
2187  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2188  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2189  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190    return NULL;
2191
2192  while (token->type != OP_ALT && token->type != END_OF_RE
2193	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194    {
2195      expr = parse_expression (regexp, preg, token, syntax, nest, err);
2196      if (BE (*err != REG_NOERROR && expr == NULL, 0))
2197	{
2198	  return NULL;
2199	}
2200      if (tree != NULL && expr != NULL)
2201	{
2202	  tree = create_tree (dfa, tree, expr, CONCAT);
2203	  if (tree == NULL)
2204	    {
2205	      *err = REG_ESPACE;
2206	      return NULL;
2207	    }
2208	}
2209      else if (tree == NULL)
2210	tree = expr;
2211      /* Otherwise expr == NULL, we don't need to create new tree.  */
2212    }
2213  return tree;
2214}
2215
2216/* This function build the following tree, from regular expression a*:
2217	 *
2218	 |
2219	 a
2220*/
2221
2222static bin_tree_t *
2223parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225{
2226  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227  bin_tree_t *tree;
2228  switch (token->type)
2229    {
2230    case CHARACTER:
2231      tree = create_token_tree (dfa, NULL, NULL, token);
2232      if (BE (tree == NULL, 0))
2233	{
2234	  *err = REG_ESPACE;
2235	  return NULL;
2236	}
2237#ifdef RE_ENABLE_I18N
2238      if (dfa->mb_cur_max > 1)
2239	{
2240	  while (!re_string_eoi (regexp)
2241		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242	    {
2243	      bin_tree_t *mbc_remain;
2244	      fetch_token (token, regexp, syntax);
2245	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2246	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2247	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2248		{
2249		  *err = REG_ESPACE;
2250		  return NULL;
2251		}
2252	    }
2253	}
2254#endif
2255      break;
2256    case OP_OPEN_SUBEXP:
2257      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2258      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259	return NULL;
2260      break;
2261    case OP_OPEN_BRACKET:
2262      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2264	return NULL;
2265      break;
2266    case OP_BACK_REF:
2267      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2268	{
2269	  *err = REG_ESUBREG;
2270	  return NULL;
2271	}
2272      dfa->used_bkref_map |= 1 << token->opr.idx;
2273      tree = create_token_tree (dfa, NULL, NULL, token);
2274      if (BE (tree == NULL, 0))
2275	{
2276	  *err = REG_ESPACE;
2277	  return NULL;
2278	}
2279      ++dfa->nbackref;
2280      dfa->has_mb_node = 1;
2281      break;
2282    case OP_OPEN_DUP_NUM:
2283      if (syntax & RE_CONTEXT_INVALID_DUP)
2284	{
2285	  *err = REG_BADRPT;
2286	  return NULL;
2287	}
2288      /* FALLTHROUGH */
2289    case OP_DUP_ASTERISK:
2290    case OP_DUP_PLUS:
2291    case OP_DUP_QUESTION:
2292      if (syntax & RE_CONTEXT_INVALID_OPS)
2293	{
2294	  *err = REG_BADRPT;
2295	  return NULL;
2296	}
2297      else if (syntax & RE_CONTEXT_INDEP_OPS)
2298	{
2299	  fetch_token (token, regexp, syntax);
2300	  return parse_expression (regexp, preg, token, syntax, nest, err);
2301	}
2302      /* else fall through  */
2303    case OP_CLOSE_SUBEXP:
2304      if ((token->type == OP_CLOSE_SUBEXP) &&
2305	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2306	{
2307	  *err = REG_ERPAREN;
2308	  return NULL;
2309	}
2310      /* else fall through  */
2311    case OP_CLOSE_DUP_NUM:
2312      /* We treat it as a normal character.  */
2313
2314      /* Then we can these characters as normal characters.  */
2315      token->type = CHARACTER;
2316      /* mb_partial and word_char bits should be initialized already
2317	 by peek_token.  */
2318      tree = create_token_tree (dfa, NULL, NULL, token);
2319      if (BE (tree == NULL, 0))
2320	{
2321	  *err = REG_ESPACE;
2322	  return NULL;
2323	}
2324      break;
2325    case ANCHOR:
2326      if ((token->opr.ctx_type
2327	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2328	  && dfa->word_ops_used == 0)
2329	init_word_char (dfa);
2330      if (token->opr.ctx_type == WORD_DELIM
2331          || token->opr.ctx_type == NOT_WORD_DELIM)
2332	{
2333	  bin_tree_t *tree_first, *tree_last;
2334	  if (token->opr.ctx_type == WORD_DELIM)
2335	    {
2336	      token->opr.ctx_type = WORD_FIRST;
2337	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2338	      token->opr.ctx_type = WORD_LAST;
2339            }
2340          else
2341            {
2342	      token->opr.ctx_type = INSIDE_WORD;
2343	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2344	      token->opr.ctx_type = INSIDE_NOTWORD;
2345            }
2346	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2347	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2348	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2349	    {
2350	      *err = REG_ESPACE;
2351	      return NULL;
2352	    }
2353	}
2354      else
2355	{
2356	  tree = create_token_tree (dfa, NULL, NULL, token);
2357	  if (BE (tree == NULL, 0))
2358	    {
2359	      *err = REG_ESPACE;
2360	      return NULL;
2361	    }
2362	}
2363      /* We must return here, since ANCHORs can't be followed
2364	 by repetition operators.
2365	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2367      fetch_token (token, regexp, syntax);
2368      return tree;
2369    case OP_PERIOD:
2370      tree = create_token_tree (dfa, NULL, NULL, token);
2371      if (BE (tree == NULL, 0))
2372	{
2373	  *err = REG_ESPACE;
2374	  return NULL;
2375	}
2376      if (dfa->mb_cur_max > 1)
2377	dfa->has_mb_node = 1;
2378      break;
2379    case OP_WORD:
2380    case OP_NOTWORD:
2381      tree = build_charclass_op (dfa, regexp->trans,
2382				 (const unsigned char *) "alnum",
2383				 (const unsigned char *) "_",
2384				 token->type == OP_NOTWORD, err);
2385      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386	return NULL;
2387      break;
2388    case OP_SPACE:
2389    case OP_NOTSPACE:
2390      tree = build_charclass_op (dfa, regexp->trans,
2391				 (const unsigned char *) "space",
2392				 (const unsigned char *) "",
2393				 token->type == OP_NOTSPACE, err);
2394      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395	return NULL;
2396      break;
2397    case OP_ALT:
2398    case END_OF_RE:
2399      return NULL;
2400    case BACK_SLASH:
2401      *err = REG_EESCAPE;
2402      return NULL;
2403    default:
2404      /* Must not happen?  */
2405#ifdef DEBUG
2406      assert (0);
2407#endif
2408      return NULL;
2409    }
2410  fetch_token (token, regexp, syntax);
2411
2412  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2413	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414    {
2415      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2416      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2417	return NULL;
2418      /* In BRE consecutive duplications are not allowed.  */
2419      if ((syntax & RE_CONTEXT_INVALID_DUP)
2420	  && (token->type == OP_DUP_ASTERISK
2421	      || token->type == OP_OPEN_DUP_NUM))
2422	{
2423	  *err = REG_BADRPT;
2424	  return NULL;
2425	}
2426    }
2427
2428  return tree;
2429}
2430
2431/* This function build the following tree, from regular expression
2432   (<reg_exp>):
2433	 SUBEXP
2434	    |
2435	<reg_exp>
2436*/
2437
2438static bin_tree_t *
2439parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2440	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2441{
2442  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2443  bin_tree_t *tree;
2444  size_t cur_nsub;
2445  cur_nsub = preg->re_nsub++;
2446
2447  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2448
2449  /* The subexpression may be a null string.  */
2450  if (token->type == OP_CLOSE_SUBEXP)
2451    tree = NULL;
2452  else
2453    {
2454      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2455      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2456        *err = REG_EPAREN;
2457      if (BE (*err != REG_NOERROR, 0))
2458	return NULL;
2459    }
2460
2461  if (cur_nsub <= '9' - '1')
2462    dfa->completed_bkref_map |= 1 << cur_nsub;
2463
2464  tree = create_tree (dfa, tree, NULL, SUBEXP);
2465  if (BE (tree == NULL, 0))
2466    {
2467      *err = REG_ESPACE;
2468      return NULL;
2469    }
2470  tree->token.opr.idx = cur_nsub;
2471  return tree;
2472}
2473
2474/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2475
2476static bin_tree_t *
2477parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2478	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2479{
2480  bin_tree_t *tree = NULL, *old_tree = NULL;
2481  Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2482  re_token_t start_token = *token;
2483
2484  if (token->type == OP_OPEN_DUP_NUM)
2485    {
2486      end = 0;
2487      start = fetch_number (regexp, token, syntax);
2488      if (start == REG_MISSING)
2489	{
2490	  if (token->type == CHARACTER && token->opr.c == ',')
2491	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2492	  else
2493	    {
2494	      *err = REG_BADBR; /* <re>{} is invalid.  */
2495	      return NULL;
2496	    }
2497	}
2498      if (BE (start != REG_ERROR, 1))
2499	{
2500	  /* We treat "{n}" as "{n,n}".  */
2501	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2502		 : ((token->type == CHARACTER && token->opr.c == ',')
2503		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2504	}
2505      if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2506	{
2507	  /* Invalid sequence.  */
2508	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2509	    {
2510	      if (token->type == END_OF_RE)
2511		*err = REG_EBRACE;
2512	      else
2513		*err = REG_BADBR;
2514
2515	      return NULL;
2516	    }
2517
2518	  /* If the syntax bit is set, rollback.  */
2519	  re_string_set_index (regexp, start_idx);
2520	  *token = start_token;
2521	  token->type = CHARACTER;
2522	  /* mb_partial and word_char bits should be already initialized by
2523	     peek_token.  */
2524	  return elem;
2525	}
2526
2527      if (BE ((end != REG_MISSING && start > end)
2528	      || token->type != OP_CLOSE_DUP_NUM, 0))
2529	{
2530	  /* First number greater than second.  */
2531	  *err = REG_BADBR;
2532	  return NULL;
2533	}
2534    }
2535  else
2536    {
2537      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2538      end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2539    }
2540
2541  fetch_token (token, regexp, syntax);
2542
2543  if (BE (elem == NULL, 0))
2544    return NULL;
2545  if (BE (start == 0 && end == 0, 0))
2546    {
2547      postorder (elem, free_tree, NULL);
2548      return NULL;
2549    }
2550
2551  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2552  if (BE (start > 0, 0))
2553    {
2554      tree = elem;
2555      for (i = 2; i <= start; ++i)
2556	{
2557	  elem = duplicate_tree (elem, dfa);
2558	  tree = create_tree (dfa, tree, elem, CONCAT);
2559	  if (BE (elem == NULL || tree == NULL, 0))
2560	    goto parse_dup_op_espace;
2561	}
2562
2563      if (start == end)
2564	return tree;
2565
2566      /* Duplicate ELEM before it is marked optional.  */
2567      elem = duplicate_tree (elem, dfa);
2568      old_tree = tree;
2569    }
2570  else
2571    old_tree = NULL;
2572
2573  if (elem->token.type == SUBEXP)
2574    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2575
2576  tree = create_tree (dfa, elem, NULL,
2577		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2578  if (BE (tree == NULL, 0))
2579    goto parse_dup_op_espace;
2580
2581  /* This loop is actually executed only when end != REG_MISSING,
2582     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2583     already created the start+1-th copy.  */
2584  verify (! TYPE_SIGNED (Idx));
2585  if (end != REG_MISSING)
2586    for (i = start + 2; i <= end; ++i)
2587      {
2588	elem = duplicate_tree (elem, dfa);
2589	tree = create_tree (dfa, tree, elem, CONCAT);
2590	if (BE (elem == NULL || tree == NULL, 0))
2591	  goto parse_dup_op_espace;
2592
2593	tree = create_tree (dfa, tree, NULL, OP_ALT);
2594	if (BE (tree == NULL, 0))
2595	  goto parse_dup_op_espace;
2596      }
2597
2598  if (old_tree)
2599    tree = create_tree (dfa, old_tree, tree, CONCAT);
2600
2601  return tree;
2602
2603 parse_dup_op_espace:
2604  *err = REG_ESPACE;
2605  return NULL;
2606}
2607
2608/* Size of the names for collating symbol/equivalence_class/character_class.
2609   I'm not sure, but maybe enough.  */
2610#define BRACKET_NAME_BUF_SIZE 32
2611
2612#ifndef _LIBC
2613  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2614     Build the range expression which starts from START_ELEM, and ends
2615     at END_ELEM.  The result are written to MBCSET and SBCSET.
2616     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2617     mbcset->range_ends, is a pointer argument sinse we may
2618     update it.  */
2619
2620static reg_errcode_t
2621internal_function
2622# ifdef RE_ENABLE_I18N
2623build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2624		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2625# else /* not RE_ENABLE_I18N */
2626build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2627		 bracket_elem_t *end_elem)
2628# endif /* not RE_ENABLE_I18N */
2629{
2630  unsigned int start_ch, end_ch;
2631  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2632  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2633	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2634	  0))
2635    return REG_ERANGE;
2636
2637  /* We can handle no multi character collating elements without libc
2638     support.  */
2639  if (BE ((start_elem->type == COLL_SYM
2640	   && strlen ((char *) start_elem->opr.name) > 1)
2641	  || (end_elem->type == COLL_SYM
2642	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2643    return REG_ECOLLATE;
2644
2645# ifdef RE_ENABLE_I18N
2646  {
2647    wchar_t wc;
2648    wint_t start_wc;
2649    wint_t end_wc;
2650    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2651
2652    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2653		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2654		   : 0));
2655    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2656	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2657		 : 0));
2658    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2659		? __btowc (start_ch) : start_elem->opr.wch);
2660    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2661	      ? __btowc (end_ch) : end_elem->opr.wch);
2662    if (start_wc == WEOF || end_wc == WEOF)
2663      return REG_ECOLLATE;
2664    cmp_buf[0] = start_wc;
2665    cmp_buf[4] = end_wc;
2666    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2667      return REG_ERANGE;
2668
2669    /* Got valid collation sequence values, add them as a new entry.
2670       However, for !_LIBC we have no collation elements: if the
2671       character set is single byte, the single byte character set
2672       that we build below suffices.  parse_bracket_exp passes
2673       no MBCSET if dfa->mb_cur_max == 1.  */
2674    if (mbcset)
2675      {
2676        /* Check the space of the arrays.  */
2677        if (BE (*range_alloc == mbcset->nranges, 0))
2678          {
2679	    /* There is not enough space, need realloc.  */
2680	    wchar_t *new_array_start, *new_array_end;
2681	    Idx new_nranges;
2682
2683	    /* +1 in case of mbcset->nranges is 0.  */
2684	    new_nranges = 2 * mbcset->nranges + 1;
2685	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2686	       are NULL if *range_alloc == 0.  */
2687	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2688				          new_nranges);
2689	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2690				        new_nranges);
2691
2692	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2693	      return REG_ESPACE;
2694
2695	    mbcset->range_starts = new_array_start;
2696	    mbcset->range_ends = new_array_end;
2697	    *range_alloc = new_nranges;
2698          }
2699
2700        mbcset->range_starts[mbcset->nranges] = start_wc;
2701        mbcset->range_ends[mbcset->nranges++] = end_wc;
2702      }
2703
2704    /* Build the table for single byte characters.  */
2705    for (wc = 0; wc < SBC_MAX; ++wc)
2706      {
2707	cmp_buf[2] = wc;
2708	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2709	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2710	  bitset_set (sbcset, wc);
2711      }
2712  }
2713# else /* not RE_ENABLE_I18N */
2714  {
2715    unsigned int ch;
2716    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2717		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2718		   : 0));
2719    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2720	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2721		 : 0));
2722    if (start_ch > end_ch)
2723      return REG_ERANGE;
2724    /* Build the table for single byte characters.  */
2725    for (ch = 0; ch < SBC_MAX; ++ch)
2726      if (start_ch <= ch  && ch <= end_ch)
2727	bitset_set (sbcset, ch);
2728  }
2729# endif /* not RE_ENABLE_I18N */
2730  return REG_NOERROR;
2731}
2732#endif /* not _LIBC */
2733
2734#ifndef _LIBC
2735/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2736   Build the collating element which is represented by NAME.
2737   The result are written to MBCSET and SBCSET.
2738   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2739   pointer argument since we may update it.  */
2740
2741static reg_errcode_t
2742internal_function
2743build_collating_symbol (bitset_t sbcset,
2744# ifdef RE_ENABLE_I18N
2745			re_charset_t *mbcset _UNUSED_PARAMETER_,
2746			Idx *coll_sym_alloc _UNUSED_PARAMETER_,
2747# endif
2748			const unsigned char *name)
2749{
2750  size_t name_len = strlen ((const char *) name);
2751  if (BE (name_len != 1, 0))
2752    return REG_ECOLLATE;
2753  else
2754    {
2755      bitset_set (sbcset, name[0]);
2756      return REG_NOERROR;
2757    }
2758}
2759#endif /* not _LIBC */
2760
2761/* This function parse bracket expression like "[abc]", "[a-c]",
2762   "[[.a-a.]]" etc.  */
2763
2764static bin_tree_t *
2765parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2766		   reg_syntax_t syntax, reg_errcode_t *err)
2767{
2768#ifdef _LIBC
2769  const unsigned char *collseqmb;
2770  const char *collseqwc;
2771  uint32_t nrules;
2772  int32_t table_size;
2773  const int32_t *symb_table;
2774  const unsigned char *extra;
2775
2776  /* Local function for parse_bracket_exp used in _LIBC environement.
2777     Seek the collating symbol entry correspondings to NAME.
2778     Return the index of the symbol in the SYMB_TABLE.  */
2779
2780  auto inline int32_t
2781  __attribute ((always_inline))
2782  seek_collating_symbol_entry (name, name_len)
2783	 const unsigned char *name;
2784	 size_t name_len;
2785    {
2786      int32_t hash = elem_hash ((const char *) name, name_len);
2787      int32_t elem = hash % table_size;
2788      if (symb_table[2 * elem] != 0)
2789	{
2790	  int32_t second = hash % (table_size - 2) + 1;
2791
2792	  do
2793	    {
2794	      /* First compare the hashing value.  */
2795	      if (symb_table[2 * elem] == hash
2796		  /* Compare the length of the name.  */
2797		  && name_len == extra[symb_table[2 * elem + 1]]
2798		  /* Compare the name.  */
2799		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2800			     name_len) == 0)
2801		{
2802		  /* Yep, this is the entry.  */
2803		  break;
2804		}
2805
2806	      /* Next entry.  */
2807	      elem += second;
2808	    }
2809	  while (symb_table[2 * elem] != 0);
2810	}
2811      return elem;
2812    }
2813
2814  /* Local function for parse_bracket_exp used in _LIBC environment.
2815     Look up the collation sequence value of BR_ELEM.
2816     Return the value if succeeded, UINT_MAX otherwise.  */
2817
2818  auto inline unsigned int
2819  __attribute ((always_inline))
2820  lookup_collation_sequence_value (br_elem)
2821	 bracket_elem_t *br_elem;
2822    {
2823      if (br_elem->type == SB_CHAR)
2824	{
2825	  /*
2826	  if (MB_CUR_MAX == 1)
2827	  */
2828	  if (nrules == 0)
2829	    return collseqmb[br_elem->opr.ch];
2830	  else
2831	    {
2832	      wint_t wc = __btowc (br_elem->opr.ch);
2833	      return __collseq_table_lookup (collseqwc, wc);
2834	    }
2835	}
2836      else if (br_elem->type == MB_CHAR)
2837	{
2838	  if (nrules != 0)
2839	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2840	}
2841      else if (br_elem->type == COLL_SYM)
2842	{
2843	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2844	  if (nrules != 0)
2845	    {
2846	      int32_t elem, idx;
2847	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2848						  sym_name_len);
2849	      if (symb_table[2 * elem] != 0)
2850		{
2851		  /* We found the entry.  */
2852		  idx = symb_table[2 * elem + 1];
2853		  /* Skip the name of collating element name.  */
2854		  idx += 1 + extra[idx];
2855		  /* Skip the byte sequence of the collating element.  */
2856		  idx += 1 + extra[idx];
2857		  /* Adjust for the alignment.  */
2858		  idx = (idx + 3) & ~3;
2859		  /* Skip the multibyte collation sequence value.  */
2860		  idx += sizeof (unsigned int);
2861		  /* Skip the wide char sequence of the collating element.  */
2862		  idx += sizeof (unsigned int) *
2863		    (1 + *(unsigned int *) (extra + idx));
2864		  /* Return the collation sequence value.  */
2865		  return *(unsigned int *) (extra + idx);
2866		}
2867	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2868		{
2869		  /* No valid character.  Match it as a single byte
2870		     character.  */
2871		  return collseqmb[br_elem->opr.name[0]];
2872		}
2873	    }
2874	  else if (sym_name_len == 1)
2875	    return collseqmb[br_elem->opr.name[0]];
2876	}
2877      return UINT_MAX;
2878    }
2879
2880  /* Local function for parse_bracket_exp used in _LIBC environement.
2881     Build the range expression which starts from START_ELEM, and ends
2882     at END_ELEM.  The result are written to MBCSET and SBCSET.
2883     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2884     mbcset->range_ends, is a pointer argument sinse we may
2885     update it.  */
2886
2887  auto inline reg_errcode_t
2888  __attribute ((always_inline))
2889  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2890	 re_charset_t *mbcset;
2891	 Idx *range_alloc;
2892	 bitset_t sbcset;
2893	 bracket_elem_t *start_elem, *end_elem;
2894    {
2895      unsigned int ch;
2896      uint32_t start_collseq;
2897      uint32_t end_collseq;
2898
2899      /* Equivalence Classes and Character Classes can't be a range
2900	 start/end.  */
2901      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2902	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2903	      0))
2904	return REG_ERANGE;
2905
2906      start_collseq = lookup_collation_sequence_value (start_elem);
2907      end_collseq = lookup_collation_sequence_value (end_elem);
2908      /* Check start/end collation sequence values.  */
2909      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2910	return REG_ECOLLATE;
2911      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2912	return REG_ERANGE;
2913
2914      /* Got valid collation sequence values, add them as a new entry.
2915	 However, if we have no collation elements, and the character set
2916	 is single byte, the single byte character set that we
2917	 build below suffices. */
2918      if (nrules > 0 || dfa->mb_cur_max > 1)
2919	{
2920          /* Check the space of the arrays.  */
2921          if (BE (*range_alloc == mbcset->nranges, 0))
2922	    {
2923	      /* There is not enough space, need realloc.  */
2924	      uint32_t *new_array_start;
2925	      uint32_t *new_array_end;
2926	      Idx new_nranges;
2927
2928	      /* +1 in case of mbcset->nranges is 0.  */
2929	      new_nranges = 2 * mbcset->nranges + 1;
2930	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2931					    new_nranges);
2932	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2933				          new_nranges);
2934
2935	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2936	        return REG_ESPACE;
2937
2938	      mbcset->range_starts = new_array_start;
2939	      mbcset->range_ends = new_array_end;
2940	      *range_alloc = new_nranges;
2941	    }
2942
2943          mbcset->range_starts[mbcset->nranges] = start_collseq;
2944          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2945	}
2946
2947      /* Build the table for single byte characters.  */
2948      for (ch = 0; ch < SBC_MAX; ch++)
2949	{
2950	  uint32_t ch_collseq;
2951	  /*
2952	  if (MB_CUR_MAX == 1)
2953	  */
2954	  if (nrules == 0)
2955	    ch_collseq = collseqmb[ch];
2956	  else
2957	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2958	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2959	    bitset_set (sbcset, ch);
2960	}
2961      return REG_NOERROR;
2962    }
2963
2964  /* Local function for parse_bracket_exp used in _LIBC environement.
2965     Build the collating element which is represented by NAME.
2966     The result are written to MBCSET and SBCSET.
2967     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2968     pointer argument sinse we may update it.  */
2969
2970  auto inline reg_errcode_t
2971  __attribute ((always_inline))
2972  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2973	 re_charset_t *mbcset;
2974	 Idx *coll_sym_alloc;
2975	 bitset_t sbcset;
2976	 const unsigned char *name;
2977    {
2978      int32_t elem, idx;
2979      size_t name_len = strlen ((const char *) name);
2980      if (nrules != 0)
2981	{
2982	  elem = seek_collating_symbol_entry (name, name_len);
2983	  if (symb_table[2 * elem] != 0)
2984	    {
2985	      /* We found the entry.  */
2986	      idx = symb_table[2 * elem + 1];
2987	      /* Skip the name of collating element name.  */
2988	      idx += 1 + extra[idx];
2989	    }
2990	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2991	    {
2992	      /* No valid character, treat it as a normal
2993		 character.  */
2994	      bitset_set (sbcset, name[0]);
2995	      return REG_NOERROR;
2996	    }
2997	  else
2998	    return REG_ECOLLATE;
2999
3000	  /* Got valid collation sequence, add it as a new entry.  */
3001	  /* Check the space of the arrays.  */
3002	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3003	    {
3004	      /* Not enough, realloc it.  */
3005	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3006	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3007	      /* Use realloc since mbcset->coll_syms is NULL
3008		 if *alloc == 0.  */
3009	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3010						   new_coll_sym_alloc);
3011	      if (BE (new_coll_syms == NULL, 0))
3012		return REG_ESPACE;
3013	      mbcset->coll_syms = new_coll_syms;
3014	      *coll_sym_alloc = new_coll_sym_alloc;
3015	    }
3016	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3017	  return REG_NOERROR;
3018	}
3019      else
3020	{
3021	  if (BE (name_len != 1, 0))
3022	    return REG_ECOLLATE;
3023	  else
3024	    {
3025	      bitset_set (sbcset, name[0]);
3026	      return REG_NOERROR;
3027	    }
3028	}
3029    }
3030#endif
3031
3032  re_token_t br_token;
3033  re_bitset_ptr_t sbcset;
3034#ifdef RE_ENABLE_I18N
3035  re_charset_t *mbcset;
3036  Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3037  Idx equiv_class_alloc = 0, char_class_alloc = 0;
3038#endif /* not RE_ENABLE_I18N */
3039  bool non_match = false;
3040  bin_tree_t *work_tree;
3041  int token_len;
3042  bool first_round = true;
3043#ifdef _LIBC
3044  collseqmb = (const unsigned char *)
3045    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3046  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3047  if (nrules)
3048    {
3049      /*
3050      if (MB_CUR_MAX > 1)
3051      */
3052      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3053      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3054      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3055						  _NL_COLLATE_SYMB_TABLEMB);
3056      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3057						   _NL_COLLATE_SYMB_EXTRAMB);
3058    }
3059#endif
3060  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3061#ifdef RE_ENABLE_I18N
3062  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3063#endif /* RE_ENABLE_I18N */
3064#ifdef RE_ENABLE_I18N
3065  if (BE (sbcset == NULL || mbcset == NULL, 0))
3066#else
3067  if (BE (sbcset == NULL, 0))
3068#endif /* RE_ENABLE_I18N */
3069    {
3070      *err = REG_ESPACE;
3071      return NULL;
3072    }
3073
3074  token_len = peek_token_bracket (token, regexp, syntax);
3075  if (BE (token->type == END_OF_RE, 0))
3076    {
3077      *err = REG_BADPAT;
3078      goto parse_bracket_exp_free_return;
3079    }
3080  if (token->type == OP_NON_MATCH_LIST)
3081    {
3082#ifdef RE_ENABLE_I18N
3083      mbcset->non_match = 1;
3084#endif /* not RE_ENABLE_I18N */
3085      non_match = true;
3086      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3087	bitset_set (sbcset, '\n');
3088      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3089      token_len = peek_token_bracket (token, regexp, syntax);
3090      if (BE (token->type == END_OF_RE, 0))
3091	{
3092	  *err = REG_BADPAT;
3093	  goto parse_bracket_exp_free_return;
3094	}
3095    }
3096
3097  /* We treat the first ']' as a normal character.  */
3098  if (token->type == OP_CLOSE_BRACKET)
3099    token->type = CHARACTER;
3100
3101  while (1)
3102    {
3103      bracket_elem_t start_elem, end_elem;
3104      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3105      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3106      reg_errcode_t ret;
3107      int token_len2 = 0;
3108      bool is_range_exp = false;
3109      re_token_t token2;
3110
3111      start_elem.opr.name = start_name_buf;
3112      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3113				   syntax, first_round);
3114      if (BE (ret != REG_NOERROR, 0))
3115	{
3116	  *err = ret;
3117	  goto parse_bracket_exp_free_return;
3118	}
3119      first_round = false;
3120
3121      /* Get information about the next token.  We need it in any case.  */
3122      token_len = peek_token_bracket (token, regexp, syntax);
3123
3124      /* Do not check for ranges if we know they are not allowed.  */
3125      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3126	{
3127	  if (BE (token->type == END_OF_RE, 0))
3128	    {
3129	      *err = REG_EBRACK;
3130	      goto parse_bracket_exp_free_return;
3131	    }
3132	  if (token->type == OP_CHARSET_RANGE)
3133	    {
3134	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3135	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3136	      if (BE (token2.type == END_OF_RE, 0))
3137		{
3138		  *err = REG_EBRACK;
3139		  goto parse_bracket_exp_free_return;
3140		}
3141	      if (token2.type == OP_CLOSE_BRACKET)
3142		{
3143		  /* We treat the last '-' as a normal character.  */
3144		  re_string_skip_bytes (regexp, -token_len);
3145		  token->type = CHARACTER;
3146		}
3147	      else
3148		is_range_exp = true;
3149	    }
3150	}
3151
3152      if (is_range_exp == true)
3153	{
3154	  end_elem.opr.name = end_name_buf;
3155	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3156				       dfa, syntax, true);
3157	  if (BE (ret != REG_NOERROR, 0))
3158	    {
3159	      *err = ret;
3160	      goto parse_bracket_exp_free_return;
3161	    }
3162
3163	  token_len = peek_token_bracket (token, regexp, syntax);
3164
3165#ifdef _LIBC
3166	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3167				  &start_elem, &end_elem);
3168#else
3169# ifdef RE_ENABLE_I18N
3170	  *err = build_range_exp (sbcset,
3171				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3172				  &range_alloc, &start_elem, &end_elem);
3173# else
3174	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3175# endif
3176#endif /* RE_ENABLE_I18N */
3177	  if (BE (*err != REG_NOERROR, 0))
3178	    goto parse_bracket_exp_free_return;
3179	}
3180      else
3181	{
3182	  switch (start_elem.type)
3183	    {
3184	    case SB_CHAR:
3185	      bitset_set (sbcset, start_elem.opr.ch);
3186	      break;
3187#ifdef RE_ENABLE_I18N
3188	    case MB_CHAR:
3189	      /* Check whether the array has enough space.  */
3190	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3191		{
3192		  wchar_t *new_mbchars;
3193		  /* Not enough, realloc it.  */
3194		  /* +1 in case of mbcset->nmbchars is 0.  */
3195		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3196		  /* Use realloc since array is NULL if *alloc == 0.  */
3197		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3198					    mbchar_alloc);
3199		  if (BE (new_mbchars == NULL, 0))
3200		    goto parse_bracket_exp_espace;
3201		  mbcset->mbchars = new_mbchars;
3202		}
3203	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3204	      break;
3205#endif /* RE_ENABLE_I18N */
3206	    case EQUIV_CLASS:
3207	      *err = build_equiv_class (sbcset,
3208#ifdef RE_ENABLE_I18N
3209					mbcset, &equiv_class_alloc,
3210#endif /* RE_ENABLE_I18N */
3211					start_elem.opr.name);
3212	      if (BE (*err != REG_NOERROR, 0))
3213		goto parse_bracket_exp_free_return;
3214	      break;
3215	    case COLL_SYM:
3216	      *err = build_collating_symbol (sbcset,
3217#ifdef RE_ENABLE_I18N
3218					     mbcset, &coll_sym_alloc,
3219#endif /* RE_ENABLE_I18N */
3220					     start_elem.opr.name);
3221	      if (BE (*err != REG_NOERROR, 0))
3222		goto parse_bracket_exp_free_return;
3223	      break;
3224	    case CHAR_CLASS:
3225	      *err = build_charclass (regexp->trans, sbcset,
3226#ifdef RE_ENABLE_I18N
3227				      mbcset, &char_class_alloc,
3228#endif /* RE_ENABLE_I18N */
3229				      start_elem.opr.name, syntax);
3230	      if (BE (*err != REG_NOERROR, 0))
3231	       goto parse_bracket_exp_free_return;
3232	      break;
3233	    default:
3234	      assert (0);
3235	      break;
3236	    }
3237	}
3238      if (BE (token->type == END_OF_RE, 0))
3239	{
3240	  *err = REG_EBRACK;
3241	  goto parse_bracket_exp_free_return;
3242	}
3243      if (token->type == OP_CLOSE_BRACKET)
3244	break;
3245    }
3246
3247  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3248
3249  /* If it is non-matching list.  */
3250  if (non_match)
3251    bitset_not (sbcset);
3252
3253#ifdef RE_ENABLE_I18N
3254  /* Ensure only single byte characters are set.  */
3255  if (dfa->mb_cur_max > 1)
3256    bitset_mask (sbcset, dfa->sb_char);
3257
3258  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3259      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3260						     || mbcset->non_match)))
3261    {
3262      bin_tree_t *mbc_tree;
3263      int sbc_idx;
3264      /* Build a tree for complex bracket.  */
3265      dfa->has_mb_node = 1;
3266      br_token.type = COMPLEX_BRACKET;
3267      br_token.opr.mbcset = mbcset;
3268      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3269      if (BE (mbc_tree == NULL, 0))
3270	goto parse_bracket_exp_espace;
3271      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3272	if (sbcset[sbc_idx])
3273	  break;
3274      /* If there are no bits set in sbcset, there is no point
3275	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3276      if (sbc_idx < BITSET_WORDS)
3277	{
3278          /* Build a tree for simple bracket.  */
3279          br_token.type = SIMPLE_BRACKET;
3280          br_token.opr.sbcset = sbcset;
3281          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282          if (BE (work_tree == NULL, 0))
3283            goto parse_bracket_exp_espace;
3284
3285          /* Then join them by ALT node.  */
3286          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3287          if (BE (work_tree == NULL, 0))
3288            goto parse_bracket_exp_espace;
3289	}
3290      else
3291	{
3292	  re_free (sbcset);
3293	  work_tree = mbc_tree;
3294	}
3295    }
3296  else
3297#endif /* not RE_ENABLE_I18N */
3298    {
3299#ifdef RE_ENABLE_I18N
3300      free_charset (mbcset);
3301#endif
3302      /* Build a tree for simple bracket.  */
3303      br_token.type = SIMPLE_BRACKET;
3304      br_token.opr.sbcset = sbcset;
3305      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3306      if (BE (work_tree == NULL, 0))
3307        goto parse_bracket_exp_espace;
3308    }
3309  return work_tree;
3310
3311 parse_bracket_exp_espace:
3312  *err = REG_ESPACE;
3313 parse_bracket_exp_free_return:
3314  re_free (sbcset);
3315#ifdef RE_ENABLE_I18N
3316  free_charset (mbcset);
3317#endif /* RE_ENABLE_I18N */
3318  return NULL;
3319}
3320
3321/* Parse an element in the bracket expression.  */
3322
3323static reg_errcode_t
3324parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3325		       re_token_t *token, int token_len,
3326		       re_dfa_t *dfa _UNUSED_PARAMETER_,
3327		       reg_syntax_t syntax, bool accept_hyphen)
3328{
3329#ifdef RE_ENABLE_I18N
3330  int cur_char_size;
3331  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3332  if (cur_char_size > 1)
3333    {
3334      elem->type = MB_CHAR;
3335      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3336      re_string_skip_bytes (regexp, cur_char_size);
3337      return REG_NOERROR;
3338    }
3339#endif /* RE_ENABLE_I18N */
3340  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3341  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3342      || token->type == OP_OPEN_EQUIV_CLASS)
3343    return parse_bracket_symbol (elem, regexp, token);
3344  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3345    {
3346      /* A '-' must only appear as anything but a range indicator before
3347	 the closing bracket.  Everything else is an error.  */
3348      re_token_t token2;
3349      (void) peek_token_bracket (&token2, regexp, syntax);
3350      if (token2.type != OP_CLOSE_BRACKET)
3351	/* The actual error value is not standardized since this whole
3352	   case is undefined.  But ERANGE makes good sense.  */
3353	return REG_ERANGE;
3354    }
3355  elem->type = SB_CHAR;
3356  elem->opr.ch = token->opr.c;
3357  return REG_NOERROR;
3358}
3359
3360/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3361   such as [:<character_class>:], [.<collating_element>.], and
3362   [=<equivalent_class>=].  */
3363
3364static reg_errcode_t
3365parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3366		      re_token_t *token)
3367{
3368  unsigned char ch, delim = token->opr.c;
3369  int i = 0;
3370  if (re_string_eoi(regexp))
3371    return REG_EBRACK;
3372  for (;; ++i)
3373    {
3374      if (i >= BRACKET_NAME_BUF_SIZE)
3375	return REG_EBRACK;
3376      if (token->type == OP_OPEN_CHAR_CLASS)
3377	ch = re_string_fetch_byte_case (regexp);
3378      else
3379	ch = re_string_fetch_byte (regexp);
3380      if (re_string_eoi(regexp))
3381	return REG_EBRACK;
3382      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3383	break;
3384      elem->opr.name[i] = ch;
3385    }
3386  re_string_skip_bytes (regexp, 1);
3387  elem->opr.name[i] = '\0';
3388  switch (token->type)
3389    {
3390    case OP_OPEN_COLL_ELEM:
3391      elem->type = COLL_SYM;
3392      break;
3393    case OP_OPEN_EQUIV_CLASS:
3394      elem->type = EQUIV_CLASS;
3395      break;
3396    case OP_OPEN_CHAR_CLASS:
3397      elem->type = CHAR_CLASS;
3398      break;
3399    default:
3400      break;
3401    }
3402  return REG_NOERROR;
3403}
3404
3405  /* Helper function for parse_bracket_exp.
3406     Build the equivalence class which is represented by NAME.
3407     The result are written to MBCSET and SBCSET.
3408     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3409     is a pointer argument sinse we may update it.  */
3410
3411static reg_errcode_t
3412#ifdef RE_ENABLE_I18N
3413build_equiv_class (bitset_t sbcset, re_charset_t *mbcset _UNUSED_PARAMETER_,
3414		   Idx *equiv_class_alloc _UNUSED_PARAMETER_,
3415		   const unsigned char *name)
3416#else /* not RE_ENABLE_I18N */
3417build_equiv_class (bitset_t sbcset, const unsigned char *name)
3418#endif /* not RE_ENABLE_I18N */
3419{
3420#ifdef _LIBC
3421  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3422  if (nrules != 0)
3423    {
3424      const int32_t *table, *indirect;
3425      const unsigned char *weights, *extra, *cp;
3426      unsigned char char_buf[2];
3427      int32_t idx1, idx2;
3428      unsigned int ch;
3429      size_t len;
3430      /* This #include defines a local function!  */
3431# include <locale/weight.h>
3432      /* Calculate the index for equivalence class.  */
3433      cp = name;
3434      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3435      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3436					       _NL_COLLATE_WEIGHTMB);
3437      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3438						   _NL_COLLATE_EXTRAMB);
3439      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3440						_NL_COLLATE_INDIRECTMB);
3441      idx1 = findidx (&cp);
3442      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3443	/* This isn't a valid character.  */
3444	return REG_ECOLLATE;
3445
3446      /* Build single byte matcing table for this equivalence class.  */
3447      char_buf[1] = (unsigned char) '\0';
3448      len = weights[idx1 & 0xffffff];
3449      for (ch = 0; ch < SBC_MAX; ++ch)
3450	{
3451	  char_buf[0] = ch;
3452	  cp = char_buf;
3453	  idx2 = findidx (&cp);
3454/*
3455	  idx2 = table[ch];
3456*/
3457	  if (idx2 == 0)
3458	    /* This isn't a valid character.  */
3459	    continue;
3460	  /* Compare only if the length matches and the collation rule
3461	     index is the same.  */
3462	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3463	    {
3464	      int cnt = 0;
3465
3466	      while (cnt <= len &&
3467		     weights[(idx1 & 0xffffff) + 1 + cnt]
3468		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3469		++cnt;
3470
3471	      if (cnt > len)
3472		bitset_set (sbcset, ch);
3473	    }
3474	}
3475      /* Check whether the array has enough space.  */
3476      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3477	{
3478	  /* Not enough, realloc it.  */
3479	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3480	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3481	  /* Use realloc since the array is NULL if *alloc == 0.  */
3482	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3483						   int32_t,
3484						   new_equiv_class_alloc);
3485	  if (BE (new_equiv_classes == NULL, 0))
3486	    return REG_ESPACE;
3487	  mbcset->equiv_classes = new_equiv_classes;
3488	  *equiv_class_alloc = new_equiv_class_alloc;
3489	}
3490      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3491    }
3492  else
3493#endif /* _LIBC */
3494    {
3495      if (BE (strlen ((const char *) name) != 1, 0))
3496	return REG_ECOLLATE;
3497      bitset_set (sbcset, *name);
3498    }
3499  return REG_NOERROR;
3500}
3501
3502  /* Helper function for parse_bracket_exp.
3503     Build the character class which is represented by NAME.
3504     The result are written to MBCSET and SBCSET.
3505     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3506     is a pointer argument sinse we may update it.  */
3507
3508static reg_errcode_t
3509#ifdef RE_ENABLE_I18N
3510build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3511		 re_charset_t *mbcset, Idx *char_class_alloc,
3512		 const unsigned char *class_name, reg_syntax_t syntax)
3513#else /* not RE_ENABLE_I18N */
3514build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3515		 const unsigned char *class_name, reg_syntax_t syntax)
3516#endif /* not RE_ENABLE_I18N */
3517{
3518  int i;
3519  const char *name = (const char *) class_name;
3520
3521  /* In case of REG_ICASE "upper" and "lower" match the both of
3522     upper and lower cases.  */
3523  if ((syntax & RE_ICASE)
3524      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3525    name = "alpha";
3526
3527#ifdef RE_ENABLE_I18N
3528  /* Check the space of the arrays.  */
3529  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3530    {
3531      /* Not enough, realloc it.  */
3532      /* +1 in case of mbcset->nchar_classes is 0.  */
3533      Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3534      /* Use realloc since array is NULL if *alloc == 0.  */
3535      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3536					       new_char_class_alloc);
3537      if (BE (new_char_classes == NULL, 0))
3538	return REG_ESPACE;
3539      mbcset->char_classes = new_char_classes;
3540      *char_class_alloc = new_char_class_alloc;
3541    }
3542  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3543#endif /* RE_ENABLE_I18N */
3544
3545#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3546  do {						\
3547    if (BE (trans != NULL, 0))			\
3548      {						\
3549	for (i = 0; i < SBC_MAX; ++i)		\
3550	  if (ctype_func (i))			\
3551	    bitset_set (sbcset, trans[i]);	\
3552      }						\
3553    else					\
3554      {						\
3555	for (i = 0; i < SBC_MAX; ++i)		\
3556	  if (ctype_func (i))			\
3557	    bitset_set (sbcset, i);		\
3558      }						\
3559  } while (0)
3560
3561  if (strcmp (name, "alnum") == 0)
3562    BUILD_CHARCLASS_LOOP (isalnum);
3563  else if (strcmp (name, "cntrl") == 0)
3564    BUILD_CHARCLASS_LOOP (iscntrl);
3565  else if (strcmp (name, "lower") == 0)
3566    BUILD_CHARCLASS_LOOP (islower);
3567  else if (strcmp (name, "space") == 0)
3568    BUILD_CHARCLASS_LOOP (isspace);
3569  else if (strcmp (name, "alpha") == 0)
3570    BUILD_CHARCLASS_LOOP (isalpha);
3571  else if (strcmp (name, "digit") == 0)
3572    BUILD_CHARCLASS_LOOP (isdigit);
3573  else if (strcmp (name, "print") == 0)
3574    BUILD_CHARCLASS_LOOP (isprint);
3575  else if (strcmp (name, "upper") == 0)
3576    BUILD_CHARCLASS_LOOP (isupper);
3577  else if (strcmp (name, "blank") == 0)
3578    BUILD_CHARCLASS_LOOP (isblank);
3579  else if (strcmp (name, "graph") == 0)
3580    BUILD_CHARCLASS_LOOP (isgraph);
3581  else if (strcmp (name, "punct") == 0)
3582    BUILD_CHARCLASS_LOOP (ispunct);
3583  else if (strcmp (name, "xdigit") == 0)
3584    BUILD_CHARCLASS_LOOP (isxdigit);
3585  else
3586    return REG_ECTYPE;
3587
3588  return REG_NOERROR;
3589}
3590
3591static bin_tree_t *
3592build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3593		    const unsigned char *class_name,
3594		    const unsigned char *extra, bool non_match,
3595		    reg_errcode_t *err)
3596{
3597  re_bitset_ptr_t sbcset;
3598#ifdef RE_ENABLE_I18N
3599  re_charset_t *mbcset;
3600  Idx alloc = 0;
3601#endif /* not RE_ENABLE_I18N */
3602  reg_errcode_t ret;
3603  re_token_t br_token;
3604  bin_tree_t *tree;
3605
3606  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3607#ifdef RE_ENABLE_I18N
3608  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3609#endif /* RE_ENABLE_I18N */
3610
3611#ifdef RE_ENABLE_I18N
3612  if (BE (sbcset == NULL || mbcset == NULL, 0))
3613#else /* not RE_ENABLE_I18N */
3614  if (BE (sbcset == NULL, 0))
3615#endif /* not RE_ENABLE_I18N */
3616    {
3617      *err = REG_ESPACE;
3618      return NULL;
3619    }
3620
3621  if (non_match)
3622    {
3623#ifdef RE_ENABLE_I18N
3624      mbcset->non_match = 1;
3625#endif /* not RE_ENABLE_I18N */
3626    }
3627
3628  /* We don't care the syntax in this case.  */
3629  ret = build_charclass (trans, sbcset,
3630#ifdef RE_ENABLE_I18N
3631			 mbcset, &alloc,
3632#endif /* RE_ENABLE_I18N */
3633			 class_name, 0);
3634
3635  if (BE (ret != REG_NOERROR, 0))
3636    {
3637      re_free (sbcset);
3638#ifdef RE_ENABLE_I18N
3639      free_charset (mbcset);
3640#endif /* RE_ENABLE_I18N */
3641      *err = ret;
3642      return NULL;
3643    }
3644  /* \w match '_' also.  */
3645  for (; *extra; extra++)
3646    bitset_set (sbcset, *extra);
3647
3648  /* If it is non-matching list.  */
3649  if (non_match)
3650    bitset_not (sbcset);
3651
3652#ifdef RE_ENABLE_I18N
3653  /* Ensure only single byte characters are set.  */
3654  if (dfa->mb_cur_max > 1)
3655    bitset_mask (sbcset, dfa->sb_char);
3656#endif
3657
3658  /* Build a tree for simple bracket.  */
3659  br_token.type = SIMPLE_BRACKET;
3660  br_token.opr.sbcset = sbcset;
3661  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3662  if (BE (tree == NULL, 0))
3663    goto build_word_op_espace;
3664
3665#ifdef RE_ENABLE_I18N
3666  if (dfa->mb_cur_max > 1)
3667    {
3668      bin_tree_t *mbc_tree;
3669      /* Build a tree for complex bracket.  */
3670      br_token.type = COMPLEX_BRACKET;
3671      br_token.opr.mbcset = mbcset;
3672      dfa->has_mb_node = 1;
3673      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3674      if (BE (mbc_tree == NULL, 0))
3675	goto build_word_op_espace;
3676      /* Then join them by ALT node.  */
3677      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3678      if (BE (mbc_tree != NULL, 1))
3679	return tree;
3680    }
3681  else
3682    {
3683      free_charset (mbcset);
3684      return tree;
3685    }
3686#else /* not RE_ENABLE_I18N */
3687  return tree;
3688#endif /* not RE_ENABLE_I18N */
3689
3690 build_word_op_espace:
3691  re_free (sbcset);
3692#ifdef RE_ENABLE_I18N
3693  free_charset (mbcset);
3694#endif /* RE_ENABLE_I18N */
3695  *err = REG_ESPACE;
3696  return NULL;
3697}
3698
3699/* This is intended for the expressions like "a{1,3}".
3700   Fetch a number from `input', and return the number.
3701   Return REG_MISSING if the number field is empty like "{,1}".
3702   Return REG_ERROR if an error occurred.  */
3703
3704static Idx
3705fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3706{
3707  Idx num = REG_MISSING;
3708  unsigned char c;
3709  while (1)
3710    {
3711      fetch_token (token, input, syntax);
3712      c = token->opr.c;
3713      if (BE (token->type == END_OF_RE, 0))
3714	return REG_ERROR;
3715      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3716	break;
3717      num = ((token->type != CHARACTER || c < '0' || '9' < c
3718	      || num == REG_ERROR)
3719	     ? REG_ERROR
3720	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3721      num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3722    }
3723  return num;
3724}
3725
3726#ifdef RE_ENABLE_I18N
3727static void
3728free_charset (re_charset_t *cset)
3729{
3730  re_free (cset->mbchars);
3731# ifdef _LIBC
3732  re_free (cset->coll_syms);
3733  re_free (cset->equiv_classes);
3734  re_free (cset->range_starts);
3735  re_free (cset->range_ends);
3736# endif
3737  re_free (cset->char_classes);
3738  re_free (cset);
3739}
3740#endif /* RE_ENABLE_I18N */
3741
3742/* Functions for binary tree operation.  */
3743
3744/* Create a tree node.  */
3745
3746static bin_tree_t *
3747create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3748	     re_token_type_t type)
3749{
3750  re_token_t t;
3751  t.type = type;
3752  return create_token_tree (dfa, left, right, &t);
3753}
3754
3755static bin_tree_t *
3756create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3757		   const re_token_t *token)
3758{
3759  bin_tree_t *tree;
3760  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3761    {
3762      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3763
3764      if (storage == NULL)
3765	return NULL;
3766      storage->next = dfa->str_tree_storage;
3767      dfa->str_tree_storage = storage;
3768      dfa->str_tree_storage_idx = 0;
3769    }
3770  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3771
3772  tree->parent = NULL;
3773  tree->left = left;
3774  tree->right = right;
3775  tree->token = *token;
3776  tree->token.duplicated = 0;
3777  tree->token.opt_subexp = 0;
3778  tree->first = NULL;
3779  tree->next = NULL;
3780  tree->node_idx = REG_MISSING;
3781
3782  if (left != NULL)
3783    left->parent = tree;
3784  if (right != NULL)
3785    right->parent = tree;
3786  return tree;
3787}
3788
3789/* Mark the tree SRC as an optional subexpression.
3790   To be called from preorder or postorder.  */
3791
3792static reg_errcode_t
3793mark_opt_subexp (void *extra, bin_tree_t *node)
3794{
3795  Idx idx = (Idx) (long) extra;
3796  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3797    node->token.opt_subexp = 1;
3798
3799  return REG_NOERROR;
3800}
3801
3802/* Free the allocated memory inside NODE. */
3803
3804static void
3805free_token (re_token_t *node)
3806{
3807#ifdef RE_ENABLE_I18N
3808  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3809    free_charset (node->opr.mbcset);
3810  else
3811#endif /* RE_ENABLE_I18N */
3812    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3813      re_free (node->opr.sbcset);
3814}
3815
3816/* Worker function for tree walking.  Free the allocated memory inside NODE
3817   and its children. */
3818
3819static reg_errcode_t
3820free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
3821{
3822  free_token (&node->token);
3823  return REG_NOERROR;
3824}
3825
3826
3827/* Duplicate the node SRC, and return new node.  This is a preorder
3828   visit similar to the one implemented by the generic visitor, but
3829   we need more infrastructure to maintain two parallel trees --- so,
3830   it's easier to duplicate.  */
3831
3832static bin_tree_t *
3833duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3834{
3835  const bin_tree_t *node;
3836  bin_tree_t *dup_root;
3837  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3838
3839  for (node = root; ; )
3840    {
3841      /* Create a new tree and link it back to the current parent.  */
3842      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3843      if (*p_new == NULL)
3844	return NULL;
3845      (*p_new)->parent = dup_node;
3846      (*p_new)->token.duplicated = 1;
3847      dup_node = *p_new;
3848
3849      /* Go to the left node, or up and to the right.  */
3850      if (node->left)
3851	{
3852	  node = node->left;
3853	  p_new = &dup_node->left;
3854	}
3855      else
3856	{
3857	  const bin_tree_t *prev = NULL;
3858	  while (node->right == prev || node->right == NULL)
3859	    {
3860	      prev = node;
3861	      node = node->parent;
3862	      dup_node = dup_node->parent;
3863	      if (!node)
3864	        return dup_root;
3865	    }
3866	  node = node->right;
3867	  p_new = &dup_node->right;
3868	}
3869    }
3870}
3871