regcomp.c revision 250723
1/* Extended regular expression matching and search library.
2   Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, write to the Free
18   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19   02111-1307 USA.  */
20
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22					  int length, reg_syntax_t syntax);
23static void re_compile_fastmap_iter (regex_t *bufp,
24				     const re_dfastate_t *init_state,
25				     char *fastmap);
26static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27static void init_word_char (re_dfa_t *dfa);
28#ifdef RE_ENABLE_I18N
29static void free_charset (re_charset_t *cset);
30#endif /* RE_ENABLE_I18N */
31static void free_workarea_compile (regex_t *preg);
32static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33#ifdef RE_ENABLE_I18N
34static void optimize_utf8 (re_dfa_t *dfa);
35#endif
36static reg_errcode_t analyze (regex_t *preg);
37static reg_errcode_t create_initial_state (re_dfa_t *dfa);
38static reg_errcode_t preorder (bin_tree_t *root,
39			       reg_errcode_t (fn (void *, bin_tree_t *)),
40			       void *extra);
41static reg_errcode_t postorder (bin_tree_t *root,
42				reg_errcode_t (fn (void *, bin_tree_t *)),
43				void *extra);
44static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47				 bin_tree_t *node);
48static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
52					     int top_clone_node, int root_node,
53					     unsigned int constraint);
54static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
55				     unsigned int constraint);
56static int search_duplicated_node (re_dfa_t *dfa, int org_node,
57				   unsigned int constraint);
58static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
59static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
60					 int node, int root);
61static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
62static int fetch_number (re_string_t *input, re_token_t *token,
63			 reg_syntax_t syntax);
64static void fetch_token (re_token_t *result, re_string_t *input,
65			 reg_syntax_t syntax);
66static int peek_token (re_token_t *token, re_string_t *input,
67			reg_syntax_t syntax);
68static int peek_token_bracket (re_token_t *token, re_string_t *input,
69			       reg_syntax_t syntax);
70static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
71			  reg_syntax_t syntax, reg_errcode_t *err);
72static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
73				  re_token_t *token, reg_syntax_t syntax,
74				  int nest, reg_errcode_t *err);
75static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
76				 re_token_t *token, reg_syntax_t syntax,
77				 int nest, reg_errcode_t *err);
78static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
79				     re_token_t *token, reg_syntax_t syntax,
80				     int nest, reg_errcode_t *err);
81static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
82				  re_token_t *token, reg_syntax_t syntax,
83				  int nest, reg_errcode_t *err);
84static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
85				 re_dfa_t *dfa, re_token_t *token,
86				 reg_syntax_t syntax, reg_errcode_t *err);
87static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
88				      re_token_t *token, reg_syntax_t syntax,
89				      reg_errcode_t *err);
90static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
91					    re_string_t *regexp,
92					    re_token_t *token, int token_len,
93					    re_dfa_t *dfa,
94					    reg_syntax_t syntax,
95					    int accept_hyphen);
96static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97					  re_string_t *regexp,
98					  re_token_t *token);
99#ifndef _LIBC
100# ifdef RE_ENABLE_I18N
101static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
102				      re_charset_t *mbcset, int *range_alloc,
103				      bracket_elem_t *start_elem,
104				      bracket_elem_t *end_elem);
105static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
106					     re_charset_t *mbcset,
107					     int *coll_sym_alloc,
108					     const unsigned char *name);
109# else /* not RE_ENABLE_I18N */
110static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
111				      bracket_elem_t *start_elem,
112				      bracket_elem_t *end_elem);
113static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
114					     const unsigned char *name);
115# endif /* not RE_ENABLE_I18N */
116#endif /* not _LIBC */
117#ifdef RE_ENABLE_I18N
118static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119					re_charset_t *mbcset,
120					int *equiv_class_alloc,
121					const unsigned char *name);
122static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
123				      re_bitset_ptr_t sbcset,
124				      re_charset_t *mbcset,
125				      int *char_class_alloc,
126				      const unsigned char *class_name,
127				      reg_syntax_t syntax);
128#else  /* not RE_ENABLE_I18N */
129static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
130					const unsigned char *name);
131static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
132				      re_bitset_ptr_t sbcset,
133				      const unsigned char *class_name,
134				      reg_syntax_t syntax);
135#endif /* not RE_ENABLE_I18N */
136static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
137				       unsigned RE_TRANSLATE_TYPE trans,
138				       const unsigned char *class_name,
139				       const unsigned char *extra,
140				       int non_match, reg_errcode_t *err);
141static bin_tree_t *create_tree (re_dfa_t *dfa,
142				bin_tree_t *left, bin_tree_t *right,
143				re_token_type_t type);
144static bin_tree_t *create_token_tree (re_dfa_t *dfa,
145				      bin_tree_t *left, bin_tree_t *right,
146				      const re_token_t *token);
147static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
148static void free_token (re_token_t *node);
149static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
150static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
151
152/* This table gives an error message for each of the error codes listed
153   in regex.h.  Obviously the order here has to be same as there.
154   POSIX doesn't require that we do anything for REG_NOERROR,
155   but why not be nice?  */
156
157const char __re_error_msgid[] attribute_hidden =
158  {
159#define REG_NOERROR_IDX	0
160    gettext_noop ("Success")	/* REG_NOERROR */
161    "\0"
162#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
163    gettext_noop ("No match")	/* REG_NOMATCH */
164    "\0"
165#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
166    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
167    "\0"
168#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
169    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
170    "\0"
171#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
172    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
173    "\0"
174#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
175    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
176    "\0"
177#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
178    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
179    "\0"
180#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
181    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
182    "\0"
183#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
184    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
185    "\0"
186#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
187    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
188    "\0"
189#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
190    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
191    "\0"
192#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
193    gettext_noop ("Invalid range end")	/* REG_ERANGE */
194    "\0"
195#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
196    gettext_noop ("Memory exhausted") /* REG_ESPACE */
197    "\0"
198#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
199    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
200    "\0"
201#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
202    gettext_noop ("Premature end of regular expression") /* REG_EEND */
203    "\0"
204#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
205    gettext_noop ("Regular expression too big") /* REG_ESIZE */
206    "\0"
207#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
208    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209  };
210
211const size_t __re_error_msgid_idx[] attribute_hidden =
212  {
213    REG_NOERROR_IDX,
214    REG_NOMATCH_IDX,
215    REG_BADPAT_IDX,
216    REG_ECOLLATE_IDX,
217    REG_ECTYPE_IDX,
218    REG_EESCAPE_IDX,
219    REG_ESUBREG_IDX,
220    REG_EBRACK_IDX,
221    REG_EPAREN_IDX,
222    REG_EBRACE_IDX,
223    REG_BADBR_IDX,
224    REG_ERANGE_IDX,
225    REG_ESPACE_IDX,
226    REG_BADRPT_IDX,
227    REG_EEND_IDX,
228    REG_ESIZE_IDX,
229    REG_ERPAREN_IDX
230  };
231
232/* Entry points for GNU code.  */
233
234/* re_compile_pattern is the GNU regular expression compiler: it
235   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
236   Returns 0 if the pattern was valid, otherwise an error string.
237
238   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
239   are set in BUFP on entry.  */
240
241const char *
242re_compile_pattern (pattern, length, bufp)
243    const char *pattern;
244    size_t length;
245    struct re_pattern_buffer *bufp;
246{
247  reg_errcode_t ret;
248
249  /* And GNU code determines whether or not to get register information
250     by passing null for the REGS argument to re_match, etc., not by
251     setting no_sub, unless RE_NO_SUB is set.  */
252  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
253
254  /* Match anchors at newline.  */
255  bufp->newline_anchor = 1;
256
257  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
258
259  if (!ret)
260    return NULL;
261  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
262}
263#ifdef _LIBC
264weak_alias (__re_compile_pattern, re_compile_pattern)
265#endif
266
267/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
268   also be assigned to arbitrarily: each pattern buffer stores its own
269   syntax, so it can be changed between regex compilations.  */
270/* This has no initializer because initialized variables in Emacs
271   become read-only after dumping.  */
272reg_syntax_t re_syntax_options;
273
274
275/* Specify the precise syntax of regexps for compilation.  This provides
276   for compatibility for various utilities which historically have
277   different, incompatible syntaxes.
278
279   The argument SYNTAX is a bit mask comprised of the various bits
280   defined in regex.h.  We return the old syntax.  */
281
282reg_syntax_t
283re_set_syntax (syntax)
284    reg_syntax_t syntax;
285{
286  reg_syntax_t ret = re_syntax_options;
287
288  re_syntax_options = syntax;
289  return ret;
290}
291#ifdef _LIBC
292weak_alias (__re_set_syntax, re_set_syntax)
293#endif
294
295int
296re_compile_fastmap (bufp)
297    struct re_pattern_buffer *bufp;
298{
299  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
300  char *fastmap = bufp->fastmap;
301
302  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
303  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
304  if (dfa->init_state != dfa->init_state_word)
305    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
306  if (dfa->init_state != dfa->init_state_nl)
307    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
308  if (dfa->init_state != dfa->init_state_begbuf)
309    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
310  bufp->fastmap_accurate = 1;
311  return 0;
312}
313#ifdef _LIBC
314weak_alias (__re_compile_fastmap, re_compile_fastmap)
315#endif
316
317static inline void
318__attribute ((always_inline))
319re_set_fastmap (char *fastmap, int icase, int ch)
320{
321  fastmap[ch] = 1;
322  if (icase)
323    fastmap[tolower (ch)] = 1;
324}
325
326/* Helper function for re_compile_fastmap.
327   Compile fastmap for the initial_state INIT_STATE.  */
328
329static void
330re_compile_fastmap_iter (bufp, init_state, fastmap)
331     regex_t *bufp;
332     const re_dfastate_t *init_state;
333     char *fastmap;
334{
335  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
336  int node_cnt;
337  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
338  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
339    {
340      int node = init_state->nodes.elems[node_cnt];
341      re_token_type_t type = dfa->nodes[node].type;
342
343      if (type == CHARACTER)
344	{
345	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
346#ifdef RE_ENABLE_I18N
347	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
348	    {
349	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
350	      wchar_t wc;
351	      mbstate_t state;
352
353	      p = buf;
354	      *p++ = dfa->nodes[node].opr.c;
355	      while (++node < dfa->nodes_len
356		     &&	dfa->nodes[node].type == CHARACTER
357		     && dfa->nodes[node].mb_partial)
358		*p++ = dfa->nodes[node].opr.c;
359	      memset (&state, 0, sizeof (state));
360	      if (mbrtowc (&wc, (const char *) buf, p - buf,
361			   &state) == p - buf
362		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
363		      != (size_t) -1))
364		re_set_fastmap (fastmap, 0, buf[0]);
365	    }
366#endif
367	}
368      else if (type == SIMPLE_BRACKET)
369	{
370	  int i, j, ch;
371	  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
372	    for (j = 0; j < UINT_BITS; ++j, ++ch)
373	      if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
374		re_set_fastmap (fastmap, icase, ch);
375	}
376#ifdef RE_ENABLE_I18N
377      else if (type == COMPLEX_BRACKET)
378	{
379	  int i;
380	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
381	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
382	      || cset->nranges || cset->nchar_classes)
383	    {
384# ifdef _LIBC
385	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
386		{
387		  /* In this case we want to catch the bytes which are
388		     the first byte of any collation elements.
389		     e.g. In da_DK, we want to catch 'a' since "aa"
390			  is a valid collation element, and don't catch
391			  'b' since 'b' is the only collation element
392			  which starts from 'b'.  */
393		  int j, ch;
394		  const int32_t *table = (const int32_t *)
395		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
396		  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
397		    for (j = 0; j < UINT_BITS; ++j, ++ch)
398		      if (table[ch] < 0)
399			re_set_fastmap (fastmap, icase, ch);
400		}
401# else
402	      if (dfa->mb_cur_max > 1)
403		for (i = 0; i < SBC_MAX; ++i)
404		  if (__btowc (i) == WEOF)
405		    re_set_fastmap (fastmap, icase, i);
406# endif /* not _LIBC */
407	    }
408	  for (i = 0; i < cset->nmbchars; ++i)
409	    {
410	      char buf[256];
411	      mbstate_t state;
412	      memset (&state, '\0', sizeof (state));
413	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
414		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
415	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
416		{
417		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
418		      != (size_t) -1)
419		    re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
420		}
421	    }
422	}
423#endif /* RE_ENABLE_I18N */
424      else if (type == OP_PERIOD
425#ifdef RE_ENABLE_I18N
426	       || type == OP_UTF8_PERIOD
427#endif /* RE_ENABLE_I18N */
428	       || type == END_OF_RE)
429	{
430	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
431	  if (type == END_OF_RE)
432	    bufp->can_be_null = 1;
433	  return;
434	}
435    }
436}
437
438/* Entry point for POSIX code.  */
439/* regcomp takes a regular expression as a string and compiles it.
440
441   PREG is a regex_t *.  We do not expect any fields to be initialized,
442   since POSIX says we shouldn't.  Thus, we set
443
444     `buffer' to the compiled pattern;
445     `used' to the length of the compiled pattern;
446     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
447       REG_EXTENDED bit in CFLAGS is set; otherwise, to
448       RE_SYNTAX_POSIX_BASIC;
449     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
450     `fastmap' to an allocated space for the fastmap;
451     `fastmap_accurate' to zero;
452     `re_nsub' to the number of subexpressions in PATTERN.
453
454   PATTERN is the address of the pattern string.
455
456   CFLAGS is a series of bits which affect compilation.
457
458     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
459     use POSIX basic syntax.
460
461     If REG_NEWLINE is set, then . and [^...] don't match newline.
462     Also, regexec will try a match beginning after every newline.
463
464     If REG_ICASE is set, then we considers upper- and lowercase
465     versions of letters to be equivalent when matching.
466
467     If REG_NOSUB is set, then when PREG is passed to regexec, that
468     routine will report only success or failure, and nothing about the
469     registers.
470
471   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
472   the return codes and their meanings.)  */
473
474int
475regcomp (preg, pattern, cflags)
476    regex_t *__restrict preg;
477    const char *__restrict pattern;
478    int cflags;
479{
480  reg_errcode_t ret;
481  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
482			 : RE_SYNTAX_POSIX_BASIC);
483
484  preg->buffer = NULL;
485  preg->allocated = 0;
486  preg->used = 0;
487
488  /* Try to allocate space for the fastmap.  */
489  preg->fastmap = re_malloc (char, SBC_MAX);
490  if (BE (preg->fastmap == NULL, 0))
491    return REG_ESPACE;
492
493  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494
495  /* If REG_NEWLINE is set, newlines are treated differently.  */
496  if (cflags & REG_NEWLINE)
497    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
498      syntax &= ~RE_DOT_NEWLINE;
499      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
500      /* It also changes the matching behavior.  */
501      preg->newline_anchor = 1;
502    }
503  else
504    preg->newline_anchor = 0;
505  preg->no_sub = !!(cflags & REG_NOSUB);
506  preg->translate = NULL;
507
508  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509
510  /* POSIX doesn't distinguish between an unmatched open-group and an
511     unmatched close-group: both are REG_EPAREN.  */
512  if (ret == REG_ERPAREN)
513    ret = REG_EPAREN;
514
515  /* We have already checked preg->fastmap != NULL.  */
516  if (BE (ret == REG_NOERROR, 1))
517    /* Compute the fastmap now, since regexec cannot modify the pattern
518       buffer.  This function never fails in this implementation.  */
519    (void) re_compile_fastmap (preg);
520  else
521    {
522      /* Some error occurred while compiling the expression.  */
523      re_free (preg->fastmap);
524      preg->fastmap = NULL;
525    }
526
527  return (int) ret;
528}
529#ifdef _LIBC
530weak_alias (__regcomp, regcomp)
531#endif
532
533/* Returns a message corresponding to an error code, ERRCODE, returned
534   from either regcomp or regexec.   We don't use PREG here.  */
535
536size_t
537regerror (errcode, preg, errbuf, errbuf_size)
538    int errcode;
539    const regex_t *preg;
540    char *errbuf;
541    size_t errbuf_size;
542{
543  const char *msg;
544  size_t msg_size;
545
546  if (BE (errcode < 0
547	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
548			       / sizeof (__re_error_msgid_idx[0])), 0))
549    /* Only error codes returned by the rest of the code should be passed
550       to this routine.  If we are given anything else, or if other regex
551       code generates an invalid error code, then the program has a bug.
552       Dump core so we can fix it.  */
553    abort ();
554
555  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
556
557  msg_size = strlen (msg) + 1; /* Includes the null.  */
558
559  if (BE (errbuf_size != 0, 1))
560    {
561      if (BE (msg_size > errbuf_size, 0))
562	{
563#if defined HAVE_MEMPCPY || defined _LIBC
564	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
565#else
566	  memcpy (errbuf, msg, errbuf_size - 1);
567	  errbuf[errbuf_size - 1] = 0;
568#endif
569	}
570      else
571	memcpy (errbuf, msg, msg_size);
572    }
573
574  return msg_size;
575}
576#ifdef _LIBC
577weak_alias (__regerror, regerror)
578#endif
579
580
581#ifdef RE_ENABLE_I18N
582/* This static array is used for the map to single-byte characters when
583   UTF-8 is used.  Otherwise we would allocate memory just to initialize
584   it the same all the time.  UTF-8 is the preferred encoding so this is
585   a worthwhile optimization.  */
586static const bitset utf8_sb_map =
587{
588  /* Set the first 128 bits.  */
589# if UINT_MAX == 0xffffffff
590  0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
591# else
592#  error "Add case for new unsigned int size"
593# endif
594};
595#endif
596
597
598static void
599free_dfa_content (re_dfa_t *dfa)
600{
601  int i, j;
602
603  if (dfa->nodes)
604    for (i = 0; i < dfa->nodes_len; ++i)
605      free_token (dfa->nodes + i);
606  re_free (dfa->nexts);
607  for (i = 0; i < dfa->nodes_len; ++i)
608    {
609      if (dfa->eclosures != NULL)
610	re_node_set_free (dfa->eclosures + i);
611      if (dfa->inveclosures != NULL)
612	re_node_set_free (dfa->inveclosures + i);
613      if (dfa->edests != NULL)
614	re_node_set_free (dfa->edests + i);
615    }
616  re_free (dfa->edests);
617  re_free (dfa->eclosures);
618  re_free (dfa->inveclosures);
619  re_free (dfa->nodes);
620
621  if (dfa->state_table)
622    for (i = 0; i <= dfa->state_hash_mask; ++i)
623      {
624	struct re_state_table_entry *entry = dfa->state_table + i;
625	for (j = 0; j < entry->num; ++j)
626	  {
627	    re_dfastate_t *state = entry->array[j];
628	    free_state (state);
629	  }
630        re_free (entry->array);
631      }
632  re_free (dfa->state_table);
633#ifdef RE_ENABLE_I18N
634  if (dfa->sb_char != utf8_sb_map)
635    re_free (dfa->sb_char);
636#endif
637  re_free (dfa->subexp_map);
638#ifdef DEBUG
639  re_free (dfa->re_str);
640#endif
641
642  re_free (dfa);
643}
644
645
646/* Free dynamically allocated space used by PREG.  */
647
648void
649regfree (preg)
650    regex_t *preg;
651{
652  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
653  if (BE (dfa != NULL, 1))
654    free_dfa_content (dfa);
655  preg->buffer = NULL;
656  preg->allocated = 0;
657
658  re_free (preg->fastmap);
659  preg->fastmap = NULL;
660
661  re_free (preg->translate);
662  preg->translate = NULL;
663}
664#ifdef _LIBC
665weak_alias (__regfree, regfree)
666#endif
667
668/* Entry points compatible with 4.2 BSD regex library.  We don't define
669   them unless specifically requested.  */
670
671#if defined _REGEX_RE_COMP || defined _LIBC
672
673/* BSD has one and only one pattern buffer.  */
674static struct re_pattern_buffer re_comp_buf;
675
676char *
677# ifdef _LIBC
678/* Make these definitions weak in libc, so POSIX programs can redefine
679   these names if they don't use our functions, and still use
680   regcomp/regexec above without link errors.  */
681weak_function
682# endif
683re_comp (s)
684     const char *s;
685{
686  reg_errcode_t ret;
687  char *fastmap;
688
689  if (!s)
690    {
691      if (!re_comp_buf.buffer)
692	return gettext ("No previous regular expression");
693      return 0;
694    }
695
696  if (re_comp_buf.buffer)
697    {
698      fastmap = re_comp_buf.fastmap;
699      re_comp_buf.fastmap = NULL;
700      __regfree (&re_comp_buf);
701      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
702      re_comp_buf.fastmap = fastmap;
703    }
704
705  if (re_comp_buf.fastmap == NULL)
706    {
707      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
708      if (re_comp_buf.fastmap == NULL)
709	return (char *) gettext (__re_error_msgid
710				 + __re_error_msgid_idx[(int) REG_ESPACE]);
711    }
712
713  /* Since `re_exec' always passes NULL for the `regs' argument, we
714     don't need to initialize the pattern buffer fields which affect it.  */
715
716  /* Match anchors at newlines.  */
717  re_comp_buf.newline_anchor = 1;
718
719  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
720
721  if (!ret)
722    return NULL;
723
724  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
725  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
726}
727
728#ifdef _LIBC
729libc_freeres_fn (free_mem)
730{
731  __regfree (&re_comp_buf);
732}
733#endif
734
735#endif /* _REGEX_RE_COMP */
736
737/* Internal entry point.
738   Compile the regular expression PATTERN, whose length is LENGTH.
739   SYNTAX indicate regular expression's syntax.  */
740
741static reg_errcode_t
742re_compile_internal (preg, pattern, length, syntax)
743     regex_t *preg;
744     const char * pattern;
745     int length;
746     reg_syntax_t syntax;
747{
748  reg_errcode_t err = REG_NOERROR;
749  re_dfa_t *dfa;
750  re_string_t regexp;
751
752  /* Initialize the pattern buffer.  */
753  preg->fastmap_accurate = 0;
754  preg->syntax = syntax;
755  preg->not_bol = preg->not_eol = 0;
756  preg->used = 0;
757  preg->re_nsub = 0;
758  preg->can_be_null = 0;
759  preg->regs_allocated = REGS_UNALLOCATED;
760
761  /* Initialize the dfa.  */
762  dfa = (re_dfa_t *) preg->buffer;
763  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
764    {
765      /* If zero allocated, but buffer is non-null, try to realloc
766	 enough space.  This loses if buffer's address is bogus, but
767	 that is the user's responsibility.  If ->buffer is NULL this
768	 is a simple allocation.  */
769      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
770      if (dfa == NULL)
771	return REG_ESPACE;
772      preg->allocated = sizeof (re_dfa_t);
773      preg->buffer = (unsigned char *) dfa;
774    }
775  preg->used = sizeof (re_dfa_t);
776
777  err = init_dfa (dfa, length);
778  if (BE (err != REG_NOERROR, 0))
779    {
780      free_dfa_content (dfa);
781      preg->buffer = NULL;
782      preg->allocated = 0;
783      return err;
784    }
785#ifdef DEBUG
786  dfa->re_str = re_malloc (char, length + 1);
787  strncpy (dfa->re_str, pattern, length + 1);
788#endif
789
790  err = re_string_construct (&regexp, pattern, length, preg->translate,
791			     syntax & RE_ICASE, dfa);
792  if (BE (err != REG_NOERROR, 0))
793    {
794    re_compile_internal_free_return:
795      free_workarea_compile (preg);
796      re_string_destruct (&regexp);
797      free_dfa_content (dfa);
798      preg->buffer = NULL;
799      preg->allocated = 0;
800      return err;
801    }
802
803  /* Parse the regular expression, and build a structure tree.  */
804  preg->re_nsub = 0;
805  dfa->str_tree = parse (&regexp, preg, syntax, &err);
806  if (BE (dfa->str_tree == NULL, 0))
807    goto re_compile_internal_free_return;
808
809  /* Analyze the tree and create the nfa.  */
810  err = analyze (preg);
811  if (BE (err != REG_NOERROR, 0))
812    goto re_compile_internal_free_return;
813
814#ifdef RE_ENABLE_I18N
815  /* If possible, do searching in single byte encoding to speed things up.  */
816  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
817    optimize_utf8 (dfa);
818#endif
819
820  /* Then create the initial state of the dfa.  */
821  err = create_initial_state (dfa);
822
823  /* Release work areas.  */
824  free_workarea_compile (preg);
825  re_string_destruct (&regexp);
826
827  if (BE (err != REG_NOERROR, 0))
828    {
829      free_dfa_content (dfa);
830      preg->buffer = NULL;
831      preg->allocated = 0;
832    }
833
834  return err;
835}
836
837/* Initialize DFA.  We use the length of the regular expression PAT_LEN
838   as the initial length of some arrays.  */
839
840static reg_errcode_t
841init_dfa (dfa, pat_len)
842     re_dfa_t *dfa;
843     int pat_len;
844{
845  int table_size;
846#ifndef _LIBC
847  char *codeset_name;
848#endif
849
850  memset (dfa, '\0', sizeof (re_dfa_t));
851
852  /* Force allocation of str_tree_storage the first time.  */
853  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854
855  dfa->nodes_alloc = pat_len + 1;
856  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
857
858  dfa->states_alloc = pat_len + 1;
859
860  /*  table_size = 2 ^ ceil(log pat_len) */
861  for (table_size = 1; table_size > 0; table_size <<= 1)
862    if (table_size > pat_len)
863      break;
864
865  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
866  dfa->state_hash_mask = table_size - 1;
867
868  dfa->mb_cur_max = MB_CUR_MAX;
869#ifdef _LIBC
870  if (dfa->mb_cur_max == 6
871      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
872    dfa->is_utf8 = 1;
873  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
874		       != 0);
875#else
876# ifdef HAVE_LANGINFO_CODESET
877  codeset_name = nl_langinfo (CODESET);
878# else
879  codeset_name = getenv ("LC_ALL");
880  if (codeset_name == NULL || codeset_name[0] == '\0')
881    codeset_name = getenv ("LC_CTYPE");
882  if (codeset_name == NULL || codeset_name[0] == '\0')
883    codeset_name = getenv ("LANG");
884  if (codeset_name == NULL)
885    codeset_name = "";
886  else if (strchr (codeset_name, '.') !=  NULL)
887    codeset_name = strchr (codeset_name, '.') + 1;
888# endif
889
890  if (strcasecmp (codeset_name, "UTF-8") == 0
891      || strcasecmp (codeset_name, "UTF8") == 0)
892    dfa->is_utf8 = 1;
893
894  /* We check exhaustively in the loop below if this charset is a
895     superset of ASCII.  */
896  dfa->map_notascii = 0;
897#endif
898
899#ifdef RE_ENABLE_I18N
900  if (dfa->mb_cur_max > 1)
901    {
902      if (dfa->is_utf8)
903	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
904      else
905	{
906	  int i, j, ch;
907
908	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
909	  if (BE (dfa->sb_char == NULL, 0))
910	    return REG_ESPACE;
911
912	  /* Clear all bits by, then set those corresponding to single
913	     byte chars.  */
914	  bitset_empty (dfa->sb_char);
915
916	  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
917	    for (j = 0; j < UINT_BITS; ++j, ++ch)
918	      {
919		wchar_t wch = __btowc (ch);
920		if (wch != WEOF)
921		  dfa->sb_char[i] |= 1 << j;
922# ifndef _LIBC
923		if (isascii (ch) && wch != (wchar_t) ch)
924		  dfa->map_notascii = 1;
925# endif
926	      }
927	}
928    }
929#endif
930
931  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
932    return REG_ESPACE;
933  return REG_NOERROR;
934}
935
936/* Initialize WORD_CHAR table, which indicate which character is
937   "word".  In this case "word" means that it is the word construction
938   character used by some operators like "\<", "\>", etc.  */
939
940static void
941init_word_char (dfa)
942     re_dfa_t *dfa;
943{
944  int i, j, ch;
945  dfa->word_ops_used = 1;
946  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
947    for (j = 0; j < UINT_BITS; ++j, ++ch)
948      if (isalnum (ch) || ch == '_')
949	dfa->word_char[i] |= 1 << j;
950}
951
952/* Free the work area which are only used while compiling.  */
953
954static void
955free_workarea_compile (preg)
956     regex_t *preg;
957{
958  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
959  bin_tree_storage_t *storage, *next;
960  for (storage = dfa->str_tree_storage; storage; storage = next)
961    {
962      next = storage->next;
963      re_free (storage);
964    }
965  dfa->str_tree_storage = NULL;
966  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
967  dfa->str_tree = NULL;
968  re_free (dfa->org_indices);
969  dfa->org_indices = NULL;
970}
971
972/* Create initial states for all contexts.  */
973
974static reg_errcode_t
975create_initial_state (dfa)
976     re_dfa_t *dfa;
977{
978  int first, i;
979  reg_errcode_t err;
980  re_node_set init_nodes;
981
982  /* Initial states have the epsilon closure of the node which is
983     the first node of the regular expression.  */
984  first = dfa->str_tree->first->node_idx;
985  dfa->init_node = first;
986  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
987  if (BE (err != REG_NOERROR, 0))
988    return err;
989
990  /* The back-references which are in initial states can epsilon transit,
991     since in this case all of the subexpressions can be null.
992     Then we add epsilon closures of the nodes which are the next nodes of
993     the back-references.  */
994  if (dfa->nbackref > 0)
995    for (i = 0; i < init_nodes.nelem; ++i)
996      {
997	int node_idx = init_nodes.elems[i];
998	re_token_type_t type = dfa->nodes[node_idx].type;
999
1000	int clexp_idx;
1001	if (type != OP_BACK_REF)
1002	  continue;
1003	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1004	  {
1005	    re_token_t *clexp_node;
1006	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1007	    if (clexp_node->type == OP_CLOSE_SUBEXP
1008		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1009	      break;
1010	  }
1011	if (clexp_idx == init_nodes.nelem)
1012	  continue;
1013
1014	if (type == OP_BACK_REF)
1015	  {
1016	    int dest_idx = dfa->edests[node_idx].elems[0];
1017	    if (!re_node_set_contains (&init_nodes, dest_idx))
1018	      {
1019		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1020		i = 0;
1021	      }
1022	  }
1023      }
1024
1025  /* It must be the first time to invoke acquire_state.  */
1026  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1027  /* We don't check ERR here, since the initial state must not be NULL.  */
1028  if (BE (dfa->init_state == NULL, 0))
1029    return err;
1030  if (dfa->init_state->has_constraint)
1031    {
1032      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1033						       CONTEXT_WORD);
1034      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1035						     CONTEXT_NEWLINE);
1036      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1037							 &init_nodes,
1038							 CONTEXT_NEWLINE
1039							 | CONTEXT_BEGBUF);
1040      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1041	      || dfa->init_state_begbuf == NULL, 0))
1042	return err;
1043    }
1044  else
1045    dfa->init_state_word = dfa->init_state_nl
1046      = dfa->init_state_begbuf = dfa->init_state;
1047
1048  re_node_set_free (&init_nodes);
1049  return REG_NOERROR;
1050}
1051
1052#ifdef RE_ENABLE_I18N
1053/* If it is possible to do searching in single byte encoding instead of UTF-8
1054   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1055   DFA nodes where needed.  */
1056
1057static void
1058optimize_utf8 (dfa)
1059     re_dfa_t *dfa;
1060{
1061  int node, i, mb_chars = 0, has_period = 0;
1062
1063  for (node = 0; node < dfa->nodes_len; ++node)
1064    switch (dfa->nodes[node].type)
1065      {
1066      case CHARACTER:
1067	if (dfa->nodes[node].opr.c >= 0x80)
1068	  mb_chars = 1;
1069	break;
1070      case ANCHOR:
1071	switch (dfa->nodes[node].opr.idx)
1072	  {
1073	  case LINE_FIRST:
1074	  case LINE_LAST:
1075	  case BUF_FIRST:
1076	  case BUF_LAST:
1077	    break;
1078	  default:
1079	    /* Word anchors etc. cannot be handled.  */
1080	    return;
1081	  }
1082	break;
1083      case OP_PERIOD:
1084        has_period = 1;
1085        break;
1086      case OP_BACK_REF:
1087      case OP_ALT:
1088      case END_OF_RE:
1089      case OP_DUP_ASTERISK:
1090      case OP_OPEN_SUBEXP:
1091      case OP_CLOSE_SUBEXP:
1092	break;
1093      case COMPLEX_BRACKET:
1094	return;
1095      case SIMPLE_BRACKET:
1096	/* Just double check.  */
1097        for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1098	  if (dfa->nodes[node].opr.sbcset[i])
1099	    return;
1100	break;
1101      default:
1102	abort ();
1103      }
1104
1105  if (mb_chars || has_period)
1106    for (node = 0; node < dfa->nodes_len; ++node)
1107      {
1108	if (dfa->nodes[node].type == CHARACTER
1109	    && dfa->nodes[node].opr.c >= 0x80)
1110	  dfa->nodes[node].mb_partial = 0;
1111	else if (dfa->nodes[node].type == OP_PERIOD)
1112	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1113      }
1114
1115  /* The search can be in single byte locale.  */
1116  dfa->mb_cur_max = 1;
1117  dfa->is_utf8 = 0;
1118  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119}
1120#endif
1121
1122/* Analyze the structure tree, and calculate "first", "next", "edest",
1123   "eclosure", and "inveclosure".  */
1124
1125static reg_errcode_t
1126analyze (preg)
1127     regex_t *preg;
1128{
1129  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1130  reg_errcode_t ret;
1131
1132  /* Allocate arrays.  */
1133  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1134  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1135  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1136  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1137  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1138	  || dfa->eclosures == NULL, 0))
1139    return REG_ESPACE;
1140
1141  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1142  if (dfa->subexp_map != NULL)
1143    {
1144      int i;
1145      for (i = 0; i < preg->re_nsub; i++)
1146	dfa->subexp_map[i] = i;
1147      preorder (dfa->str_tree, optimize_subexps, dfa);
1148      for (i = 0; i < preg->re_nsub; i++)
1149	if (dfa->subexp_map[i] != i)
1150	  break;
1151      if (i == preg->re_nsub)
1152	{
1153	  free (dfa->subexp_map);
1154	  dfa->subexp_map = NULL;
1155	}
1156    }
1157
1158  ret = postorder (dfa->str_tree, lower_subexps, preg);
1159  if (BE (ret != REG_NOERROR, 0))
1160    return ret;
1161  ret = postorder (dfa->str_tree, calc_first, dfa);
1162  if (BE (ret != REG_NOERROR, 0))
1163    return ret;
1164  preorder (dfa->str_tree, calc_next, dfa);
1165  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1166  if (BE (ret != REG_NOERROR, 0))
1167    return ret;
1168  ret = calc_eclosure (dfa);
1169  if (BE (ret != REG_NOERROR, 0))
1170    return ret;
1171
1172  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1173     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1174  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1175      || dfa->nbackref)
1176    {
1177      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1178      if (BE (dfa->inveclosures == NULL, 0))
1179        return REG_ESPACE;
1180      ret = calc_inveclosure (dfa);
1181    }
1182
1183  return ret;
1184}
1185
1186/* Our parse trees are very unbalanced, so we cannot use a stack to
1187   implement parse tree visits.  Instead, we use parent pointers and
1188   some hairy code in these two functions.  */
1189static reg_errcode_t
1190postorder (root, fn, extra)
1191     bin_tree_t *root;
1192     reg_errcode_t (fn (void *, bin_tree_t *));
1193     void *extra;
1194{
1195  bin_tree_t *node, *prev;
1196
1197  for (node = root; ; )
1198    {
1199      /* Descend down the tree, preferably to the left (or to the right
1200	 if that's the only child).  */
1201      while (node->left || node->right)
1202	if (node->left)
1203          node = node->left;
1204        else
1205          node = node->right;
1206
1207      do
1208	{
1209	  reg_errcode_t err = fn (extra, node);
1210	  if (BE (err != REG_NOERROR, 0))
1211	    return err;
1212          if (node->parent == NULL)
1213	    return REG_NOERROR;
1214	  prev = node;
1215	  node = node->parent;
1216	}
1217      /* Go up while we have a node that is reached from the right.  */
1218      while (node->right == prev || node->right == NULL);
1219      node = node->right;
1220    }
1221}
1222
1223static reg_errcode_t
1224preorder (root, fn, extra)
1225     bin_tree_t *root;
1226     reg_errcode_t (fn (void *, bin_tree_t *));
1227     void *extra;
1228{
1229  bin_tree_t *node;
1230
1231  for (node = root; ; )
1232    {
1233      reg_errcode_t err = fn (extra, node);
1234      if (BE (err != REG_NOERROR, 0))
1235	return err;
1236
1237      /* Go to the left node, or up and to the right.  */
1238      if (node->left)
1239	node = node->left;
1240      else
1241	{
1242	  bin_tree_t *prev = NULL;
1243	  while (node->right == prev || node->right == NULL)
1244	    {
1245	      prev = node;
1246	      node = node->parent;
1247	      if (!node)
1248	        return REG_NOERROR;
1249	    }
1250	  node = node->right;
1251	}
1252    }
1253}
1254
1255/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1256   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1257   backreferences as well.  Requires a preorder visit.  */
1258static reg_errcode_t
1259optimize_subexps (extra, node)
1260     void *extra;
1261     bin_tree_t *node;
1262{
1263  re_dfa_t *dfa = (re_dfa_t *) extra;
1264
1265  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1266    {
1267      int idx = node->token.opr.idx;
1268      node->token.opr.idx = dfa->subexp_map[idx];
1269      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1270    }
1271
1272  else if (node->token.type == SUBEXP
1273           && node->left && node->left->token.type == SUBEXP)
1274    {
1275      int other_idx = node->left->token.opr.idx;
1276
1277      node->left = node->left->left;
1278      if (node->left)
1279        node->left->parent = node;
1280
1281      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1282      if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1283	dfa->used_bkref_map &= ~(1 << other_idx);
1284    }
1285
1286  return REG_NOERROR;
1287}
1288
1289/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1290   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1291static reg_errcode_t
1292lower_subexps (extra, node)
1293     void *extra;
1294     bin_tree_t *node;
1295{
1296  regex_t *preg = (regex_t *) extra;
1297  reg_errcode_t err = REG_NOERROR;
1298
1299  if (node->left && node->left->token.type == SUBEXP)
1300    {
1301      node->left = lower_subexp (&err, preg, node->left);
1302      if (node->left)
1303	node->left->parent = node;
1304    }
1305  if (node->right && node->right->token.type == SUBEXP)
1306    {
1307      node->right = lower_subexp (&err, preg, node->right);
1308      if (node->right)
1309	node->right->parent = node;
1310    }
1311
1312  return err;
1313}
1314
1315static bin_tree_t *
1316lower_subexp (err, preg, node)
1317     reg_errcode_t *err;
1318     regex_t *preg;
1319     bin_tree_t *node;
1320{
1321  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1322  bin_tree_t *body = node->left;
1323  bin_tree_t *op, *cls, *tree1, *tree;
1324
1325  if (preg->no_sub
1326      /* We do not optimize empty subexpressions, because otherwise we may
1327	 have bad CONCAT nodes with NULL children.  This is obviously not
1328	 very common, so we do not lose much.  An example that triggers
1329	 this case is the sed "script" /\(\)/x.  */
1330      && node->left != NULL
1331      && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1332	  || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1333    return node->left;
1334
1335  /* Convert the SUBEXP node to the concatenation of an
1336     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1337  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1338  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1339  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1340  tree = create_tree (dfa, op, tree1, CONCAT);
1341  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1342    {
1343      *err = REG_ESPACE;
1344      return NULL;
1345    }
1346
1347  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1348  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1349  return tree;
1350}
1351
1352/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1353   nodes.  Requires a postorder visit.  */
1354static reg_errcode_t
1355calc_first (extra, node)
1356     void *extra;
1357     bin_tree_t *node;
1358{
1359  re_dfa_t *dfa = (re_dfa_t *) extra;
1360  if (node->token.type == CONCAT)
1361    {
1362      node->first = node->left->first;
1363      node->node_idx = node->left->node_idx;
1364    }
1365  else
1366    {
1367      node->first = node;
1368      node->node_idx = re_dfa_add_node (dfa, node->token);
1369      if (BE (node->node_idx == -1, 0))
1370        return REG_ESPACE;
1371    }
1372  return REG_NOERROR;
1373}
1374
1375/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1376static reg_errcode_t
1377calc_next (extra, node)
1378     void *extra;
1379     bin_tree_t *node;
1380{
1381  switch (node->token.type)
1382    {
1383    case OP_DUP_ASTERISK:
1384      node->left->next = node;
1385      break;
1386    case CONCAT:
1387      node->left->next = node->right->first;
1388      node->right->next = node->next;
1389      break;
1390    default:
1391      if (node->left)
1392	node->left->next = node->next;
1393      if (node->right)
1394        node->right->next = node->next;
1395      break;
1396    }
1397  return REG_NOERROR;
1398}
1399
1400/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1401static reg_errcode_t
1402link_nfa_nodes (extra, node)
1403     void *extra;
1404     bin_tree_t *node;
1405{
1406  re_dfa_t *dfa = (re_dfa_t *) extra;
1407  int idx = node->node_idx;
1408  reg_errcode_t err = REG_NOERROR;
1409
1410  switch (node->token.type)
1411    {
1412    case CONCAT:
1413      break;
1414
1415    case END_OF_RE:
1416      assert (node->next == NULL);
1417      break;
1418
1419    case OP_DUP_ASTERISK:
1420    case OP_ALT:
1421      {
1422	int left, right;
1423	dfa->has_plural_match = 1;
1424	if (node->left != NULL)
1425	  left = node->left->first->node_idx;
1426	else
1427	  left = node->next->node_idx;
1428	if (node->right != NULL)
1429	  right = node->right->first->node_idx;
1430	else
1431	  right = node->next->node_idx;
1432	assert (left > -1);
1433	assert (right > -1);
1434	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1435      }
1436      break;
1437
1438    case ANCHOR:
1439    case OP_OPEN_SUBEXP:
1440    case OP_CLOSE_SUBEXP:
1441      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1442      break;
1443
1444    case OP_BACK_REF:
1445      dfa->nexts[idx] = node->next->node_idx;
1446      if (node->token.type == OP_BACK_REF)
1447	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1448      break;
1449
1450    default:
1451      assert (!IS_EPSILON_NODE (node->token.type));
1452      dfa->nexts[idx] = node->next->node_idx;
1453      break;
1454    }
1455
1456  return err;
1457}
1458
1459/* Duplicate the epsilon closure of the node ROOT_NODE.
1460   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1461   to their own constraint.  */
1462
1463static reg_errcode_t
1464duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1465			init_constraint)
1466     re_dfa_t *dfa;
1467     int top_org_node, top_clone_node, root_node;
1468     unsigned int init_constraint;
1469{
1470  reg_errcode_t err;
1471  int org_node, clone_node, ret;
1472  unsigned int constraint = init_constraint;
1473  for (org_node = top_org_node, clone_node = top_clone_node;;)
1474    {
1475      int org_dest, clone_dest;
1476      if (dfa->nodes[org_node].type == OP_BACK_REF)
1477	{
1478	  /* If the back reference epsilon-transit, its destination must
1479	     also have the constraint.  Then duplicate the epsilon closure
1480	     of the destination of the back reference, and store it in
1481	     edests of the back reference.  */
1482	  org_dest = dfa->nexts[org_node];
1483	  re_node_set_empty (dfa->edests + clone_node);
1484	  err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1485	  if (BE (err != REG_NOERROR, 0))
1486	    return err;
1487	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1488	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489	  if (BE (ret < 0, 0))
1490	    return REG_ESPACE;
1491	}
1492      else if (dfa->edests[org_node].nelem == 0)
1493	{
1494	  /* In case of the node can't epsilon-transit, don't duplicate the
1495	     destination and store the original destination as the
1496	     destination of the node.  */
1497	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1498	  break;
1499	}
1500      else if (dfa->edests[org_node].nelem == 1)
1501	{
1502	  /* In case of the node can epsilon-transit, and it has only one
1503	     destination.  */
1504	  org_dest = dfa->edests[org_node].elems[0];
1505	  re_node_set_empty (dfa->edests + clone_node);
1506	  if (dfa->nodes[org_node].type == ANCHOR)
1507	    {
1508	      /* In case of the node has another constraint, append it.  */
1509	      if (org_node == root_node && clone_node != org_node)
1510		{
1511		  /* ...but if the node is root_node itself, it means the
1512		     epsilon closure have a loop, then tie it to the
1513		     destination of the root_node.  */
1514		  ret = re_node_set_insert (dfa->edests + clone_node,
1515					    org_dest);
1516		  if (BE (ret < 0, 0))
1517		    return REG_ESPACE;
1518		  break;
1519		}
1520	      constraint |= dfa->nodes[org_node].opr.ctx_type;
1521	    }
1522	  err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1523	  if (BE (err != REG_NOERROR, 0))
1524	    return err;
1525	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526	  if (BE (ret < 0, 0))
1527	    return REG_ESPACE;
1528	}
1529      else /* dfa->edests[org_node].nelem == 2 */
1530	{
1531	  /* In case of the node can epsilon-transit, and it has two
1532	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1533	  org_dest = dfa->edests[org_node].elems[0];
1534	  re_node_set_empty (dfa->edests + clone_node);
1535	  /* Search for a duplicated node which satisfies the constraint.  */
1536	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1537	  if (clone_dest == -1)
1538	    {
1539	      /* There are no such a duplicated node, create a new one.  */
1540	      err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1541	      if (BE (err != REG_NOERROR, 0))
1542		return err;
1543	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544	      if (BE (ret < 0, 0))
1545		return REG_ESPACE;
1546	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547					    root_node, constraint);
1548	      if (BE (err != REG_NOERROR, 0))
1549		return err;
1550	    }
1551	  else
1552	    {
1553	      /* There are a duplicated node which satisfy the constraint,
1554		 use it to avoid infinite loop.  */
1555	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556	      if (BE (ret < 0, 0))
1557		return REG_ESPACE;
1558	    }
1559
1560	  org_dest = dfa->edests[org_node].elems[1];
1561	  err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1562	  if (BE (err != REG_NOERROR, 0))
1563	    return err;
1564	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565	  if (BE (ret < 0, 0))
1566	    return REG_ESPACE;
1567	}
1568      org_node = org_dest;
1569      clone_node = clone_dest;
1570    }
1571  return REG_NOERROR;
1572}
1573
1574/* Search for a node which is duplicated from the node ORG_NODE, and
1575   satisfies the constraint CONSTRAINT.  */
1576
1577static int
1578search_duplicated_node (dfa, org_node, constraint)
1579     re_dfa_t *dfa;
1580     int org_node;
1581     unsigned int constraint;
1582{
1583  int idx;
1584  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1585    {
1586      if (org_node == dfa->org_indices[idx]
1587	  && constraint == dfa->nodes[idx].constraint)
1588	return idx; /* Found.  */
1589    }
1590  return -1; /* Not found.  */
1591}
1592
1593/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1594   The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1595   otherwise return the error code.  */
1596
1597static reg_errcode_t
1598duplicate_node (new_idx, dfa, org_idx, constraint)
1599     re_dfa_t *dfa;
1600     int *new_idx, org_idx;
1601     unsigned int constraint;
1602{
1603  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1604  if (BE (dup_idx == -1, 0))
1605    return REG_ESPACE;
1606  dfa->nodes[dup_idx].constraint = constraint;
1607  if (dfa->nodes[org_idx].type == ANCHOR)
1608    dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1609  dfa->nodes[dup_idx].duplicated = 1;
1610
1611  /* Store the index of the original node.  */
1612  dfa->org_indices[dup_idx] = org_idx;
1613  *new_idx = dup_idx;
1614  return REG_NOERROR;
1615}
1616
1617static reg_errcode_t
1618calc_inveclosure (dfa)
1619     re_dfa_t *dfa;
1620{
1621  int src, idx, ret;
1622  for (idx = 0; idx < dfa->nodes_len; ++idx)
1623    re_node_set_init_empty (dfa->inveclosures + idx);
1624
1625  for (src = 0; src < dfa->nodes_len; ++src)
1626    {
1627      int *elems = dfa->eclosures[src].elems;
1628      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1629	{
1630	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1631	  if (BE (ret == -1, 0))
1632	    return REG_ESPACE;
1633	}
1634    }
1635
1636  return REG_NOERROR;
1637}
1638
1639/* Calculate "eclosure" for all the node in DFA.  */
1640
1641static reg_errcode_t
1642calc_eclosure (dfa)
1643     re_dfa_t *dfa;
1644{
1645  int node_idx, incomplete;
1646#ifdef DEBUG
1647  assert (dfa->nodes_len > 0);
1648#endif
1649  incomplete = 0;
1650  /* For each nodes, calculate epsilon closure.  */
1651  for (node_idx = 0; ; ++node_idx)
1652    {
1653      reg_errcode_t err;
1654      re_node_set eclosure_elem;
1655      if (node_idx == dfa->nodes_len)
1656	{
1657	  if (!incomplete)
1658	    break;
1659	  incomplete = 0;
1660	  node_idx = 0;
1661	}
1662
1663#ifdef DEBUG
1664      assert (dfa->eclosures[node_idx].nelem != -1);
1665#endif
1666
1667      /* If we have already calculated, skip it.  */
1668      if (dfa->eclosures[node_idx].nelem != 0)
1669	continue;
1670      /* Calculate epsilon closure of `node_idx'.  */
1671      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1672      if (BE (err != REG_NOERROR, 0))
1673	return err;
1674
1675      if (dfa->eclosures[node_idx].nelem == 0)
1676	{
1677	  incomplete = 1;
1678	  re_node_set_free (&eclosure_elem);
1679	}
1680    }
1681  return REG_NOERROR;
1682}
1683
1684/* Calculate epsilon closure of NODE.  */
1685
1686static reg_errcode_t
1687calc_eclosure_iter (new_set, dfa, node, root)
1688     re_node_set *new_set;
1689     re_dfa_t *dfa;
1690     int node, root;
1691{
1692  reg_errcode_t err;
1693  unsigned int constraint;
1694  int i, incomplete;
1695  re_node_set eclosure;
1696  incomplete = 0;
1697  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1698  if (BE (err != REG_NOERROR, 0))
1699    return err;
1700
1701  /* This indicates that we are calculating this node now.
1702     We reference this value to avoid infinite loop.  */
1703  dfa->eclosures[node].nelem = -1;
1704
1705  constraint = ((dfa->nodes[node].type == ANCHOR)
1706		? dfa->nodes[node].opr.ctx_type : 0);
1707  /* If the current node has constraints, duplicate all nodes.
1708     Since they must inherit the constraints.  */
1709  if (constraint
1710      && dfa->edests[node].nelem
1711      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1712    {
1713      int org_node, cur_node;
1714      org_node = cur_node = node;
1715      err = duplicate_node_closure (dfa, node, node, node, constraint);
1716      if (BE (err != REG_NOERROR, 0))
1717	return err;
1718    }
1719
1720  /* Expand each epsilon destination nodes.  */
1721  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1722    for (i = 0; i < dfa->edests[node].nelem; ++i)
1723      {
1724	re_node_set eclosure_elem;
1725	int edest = dfa->edests[node].elems[i];
1726	/* If calculating the epsilon closure of `edest' is in progress,
1727	   return intermediate result.  */
1728	if (dfa->eclosures[edest].nelem == -1)
1729	  {
1730	    incomplete = 1;
1731	    continue;
1732	  }
1733	/* If we haven't calculated the epsilon closure of `edest' yet,
1734	   calculate now. Otherwise use calculated epsilon closure.  */
1735	if (dfa->eclosures[edest].nelem == 0)
1736	  {
1737	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1738	    if (BE (err != REG_NOERROR, 0))
1739	      return err;
1740	  }
1741	else
1742	  eclosure_elem = dfa->eclosures[edest];
1743	/* Merge the epsilon closure of `edest'.  */
1744	re_node_set_merge (&eclosure, &eclosure_elem);
1745	/* If the epsilon closure of `edest' is incomplete,
1746	   the epsilon closure of this node is also incomplete.  */
1747	if (dfa->eclosures[edest].nelem == 0)
1748	  {
1749	    incomplete = 1;
1750	    re_node_set_free (&eclosure_elem);
1751	  }
1752      }
1753
1754  /* Epsilon closures include itself.  */
1755  re_node_set_insert (&eclosure, node);
1756  if (incomplete && !root)
1757    dfa->eclosures[node].nelem = 0;
1758  else
1759    dfa->eclosures[node] = eclosure;
1760  *new_set = eclosure;
1761  return REG_NOERROR;
1762}
1763
1764/* Functions for token which are used in the parser.  */
1765
1766/* Fetch a token from INPUT.
1767   We must not use this function inside bracket expressions.  */
1768
1769static void
1770fetch_token (result, input, syntax)
1771     re_token_t *result;
1772     re_string_t *input;
1773     reg_syntax_t syntax;
1774{
1775  re_string_skip_bytes (input, peek_token (result, input, syntax));
1776}
1777
1778/* Peek a token from INPUT, and return the length of the token.
1779   We must not use this function inside bracket expressions.  */
1780
1781static int
1782peek_token (token, input, syntax)
1783     re_token_t *token;
1784     re_string_t *input;
1785     reg_syntax_t syntax;
1786{
1787  unsigned char c;
1788
1789  if (re_string_eoi (input))
1790    {
1791      token->type = END_OF_RE;
1792      return 0;
1793    }
1794
1795  c = re_string_peek_byte (input, 0);
1796  token->opr.c = c;
1797
1798  token->word_char = 0;
1799#ifdef RE_ENABLE_I18N
1800  token->mb_partial = 0;
1801  if (input->mb_cur_max > 1 &&
1802      !re_string_first_byte (input, re_string_cur_idx (input)))
1803    {
1804      token->type = CHARACTER;
1805      token->mb_partial = 1;
1806      return 1;
1807    }
1808#endif
1809  if (c == '\\')
1810    {
1811      unsigned char c2;
1812      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1813	{
1814	  token->type = BACK_SLASH;
1815	  return 1;
1816	}
1817
1818      c2 = re_string_peek_byte_case (input, 1);
1819      token->opr.c = c2;
1820      token->type = CHARACTER;
1821#ifdef RE_ENABLE_I18N
1822      if (input->mb_cur_max > 1)
1823	{
1824	  wint_t wc = re_string_wchar_at (input,
1825					  re_string_cur_idx (input) + 1);
1826	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1827	}
1828      else
1829#endif
1830	token->word_char = IS_WORD_CHAR (c2) != 0;
1831
1832      switch (c2)
1833	{
1834	case '|':
1835	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1836	    token->type = OP_ALT;
1837	  break;
1838	case '1': case '2': case '3': case '4': case '5':
1839	case '6': case '7': case '8': case '9':
1840	  if (!(syntax & RE_NO_BK_REFS))
1841	    {
1842	      token->type = OP_BACK_REF;
1843	      token->opr.idx = c2 - '1';
1844	    }
1845	  break;
1846	case '<':
1847	  if (!(syntax & RE_NO_GNU_OPS))
1848	    {
1849	      token->type = ANCHOR;
1850	      token->opr.ctx_type = WORD_FIRST;
1851	    }
1852	  break;
1853	case '>':
1854	  if (!(syntax & RE_NO_GNU_OPS))
1855	    {
1856	      token->type = ANCHOR;
1857	      token->opr.ctx_type = WORD_LAST;
1858	    }
1859	  break;
1860	case 'b':
1861	  if (!(syntax & RE_NO_GNU_OPS))
1862	    {
1863	      token->type = ANCHOR;
1864	      token->opr.ctx_type = WORD_DELIM;
1865	    }
1866	  break;
1867	case 'B':
1868	  if (!(syntax & RE_NO_GNU_OPS))
1869	    {
1870	      token->type = ANCHOR;
1871	      token->opr.ctx_type = NOT_WORD_DELIM;
1872	    }
1873	  break;
1874	case 'w':
1875	  if (!(syntax & RE_NO_GNU_OPS))
1876	    token->type = OP_WORD;
1877	  break;
1878	case 'W':
1879	  if (!(syntax & RE_NO_GNU_OPS))
1880	    token->type = OP_NOTWORD;
1881	  break;
1882	case 's':
1883	  if (!(syntax & RE_NO_GNU_OPS))
1884	    token->type = OP_SPACE;
1885	  break;
1886	case 'S':
1887	  if (!(syntax & RE_NO_GNU_OPS))
1888	    token->type = OP_NOTSPACE;
1889	  break;
1890	case '`':
1891	  if (!(syntax & RE_NO_GNU_OPS))
1892	    {
1893	      token->type = ANCHOR;
1894	      token->opr.ctx_type = BUF_FIRST;
1895	    }
1896	  break;
1897	case '\'':
1898	  if (!(syntax & RE_NO_GNU_OPS))
1899	    {
1900	      token->type = ANCHOR;
1901	      token->opr.ctx_type = BUF_LAST;
1902	    }
1903	  break;
1904	case '(':
1905	  if (!(syntax & RE_NO_BK_PARENS))
1906	    token->type = OP_OPEN_SUBEXP;
1907	  break;
1908	case ')':
1909	  if (!(syntax & RE_NO_BK_PARENS))
1910	    token->type = OP_CLOSE_SUBEXP;
1911	  break;
1912	case '+':
1913	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914	    token->type = OP_DUP_PLUS;
1915	  break;
1916	case '?':
1917	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1918	    token->type = OP_DUP_QUESTION;
1919	  break;
1920	case '{':
1921	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922	    token->type = OP_OPEN_DUP_NUM;
1923	  break;
1924	case '}':
1925	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1926	    token->type = OP_CLOSE_DUP_NUM;
1927	  break;
1928	default:
1929	  break;
1930	}
1931      return 2;
1932    }
1933
1934  token->type = CHARACTER;
1935#ifdef RE_ENABLE_I18N
1936  if (input->mb_cur_max > 1)
1937    {
1938      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1939      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1940    }
1941  else
1942#endif
1943    token->word_char = IS_WORD_CHAR (token->opr.c);
1944
1945  switch (c)
1946    {
1947    case '\n':
1948      if (syntax & RE_NEWLINE_ALT)
1949	token->type = OP_ALT;
1950      break;
1951    case '|':
1952      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1953	token->type = OP_ALT;
1954      break;
1955    case '*':
1956      token->type = OP_DUP_ASTERISK;
1957      break;
1958    case '+':
1959      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960	token->type = OP_DUP_PLUS;
1961      break;
1962    case '?':
1963      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1964	token->type = OP_DUP_QUESTION;
1965      break;
1966    case '{':
1967      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968	token->type = OP_OPEN_DUP_NUM;
1969      break;
1970    case '}':
1971      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1972	token->type = OP_CLOSE_DUP_NUM;
1973      break;
1974    case '(':
1975      if (syntax & RE_NO_BK_PARENS)
1976	token->type = OP_OPEN_SUBEXP;
1977      break;
1978    case ')':
1979      if (syntax & RE_NO_BK_PARENS)
1980	token->type = OP_CLOSE_SUBEXP;
1981      break;
1982    case '[':
1983      token->type = OP_OPEN_BRACKET;
1984      break;
1985    case '.':
1986      token->type = OP_PERIOD;
1987      break;
1988    case '^':
1989      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1990	  re_string_cur_idx (input) != 0)
1991	{
1992	  char prev = re_string_peek_byte (input, -1);
1993	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1994	    break;
1995	}
1996      token->type = ANCHOR;
1997      token->opr.ctx_type = LINE_FIRST;
1998      break;
1999    case '$':
2000      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2001	  re_string_cur_idx (input) + 1 != re_string_length (input))
2002	{
2003	  re_token_t next;
2004	  re_string_skip_bytes (input, 1);
2005	  peek_token (&next, input, syntax);
2006	  re_string_skip_bytes (input, -1);
2007	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2008	    break;
2009	}
2010      token->type = ANCHOR;
2011      token->opr.ctx_type = LINE_LAST;
2012      break;
2013    default:
2014      break;
2015    }
2016  return 1;
2017}
2018
2019/* Peek a token from INPUT, and return the length of the token.
2020   We must not use this function out of bracket expressions.  */
2021
2022static int
2023peek_token_bracket (token, input, syntax)
2024     re_token_t *token;
2025     re_string_t *input;
2026     reg_syntax_t syntax;
2027{
2028  unsigned char c;
2029  if (re_string_eoi (input))
2030    {
2031      token->type = END_OF_RE;
2032      return 0;
2033    }
2034  c = re_string_peek_byte (input, 0);
2035  token->opr.c = c;
2036
2037#ifdef RE_ENABLE_I18N
2038  if (input->mb_cur_max > 1 &&
2039      !re_string_first_byte (input, re_string_cur_idx (input)))
2040    {
2041      token->type = CHARACTER;
2042      return 1;
2043    }
2044#endif /* RE_ENABLE_I18N */
2045
2046  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2047      && re_string_cur_idx (input) + 1 < re_string_length (input))
2048    {
2049      /* In this case, '\' escape a character.  */
2050      unsigned char c2;
2051      re_string_skip_bytes (input, 1);
2052      c2 = re_string_peek_byte (input, 0);
2053      token->opr.c = c2;
2054      token->type = CHARACTER;
2055      return 1;
2056    }
2057  if (c == '[') /* '[' is a special char in a bracket exps.  */
2058    {
2059      unsigned char c2;
2060      int token_len;
2061      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2062	c2 = re_string_peek_byte (input, 1);
2063      else
2064	c2 = 0;
2065      token->opr.c = c2;
2066      token_len = 2;
2067      switch (c2)
2068	{
2069	case '.':
2070	  token->type = OP_OPEN_COLL_ELEM;
2071	  break;
2072	case '=':
2073	  token->type = OP_OPEN_EQUIV_CLASS;
2074	  break;
2075	case ':':
2076	  if (syntax & RE_CHAR_CLASSES)
2077	    {
2078	      token->type = OP_OPEN_CHAR_CLASS;
2079	      break;
2080	    }
2081	  /* else fall through.  */
2082	default:
2083	  token->type = CHARACTER;
2084	  token->opr.c = c;
2085	  token_len = 1;
2086	  break;
2087	}
2088      return token_len;
2089    }
2090  switch (c)
2091    {
2092    case '-':
2093      token->type = OP_CHARSET_RANGE;
2094      break;
2095    case ']':
2096      token->type = OP_CLOSE_BRACKET;
2097      break;
2098    case '^':
2099      token->type = OP_NON_MATCH_LIST;
2100      break;
2101    default:
2102      token->type = CHARACTER;
2103    }
2104  return 1;
2105}
2106
2107/* Functions for parser.  */
2108
2109/* Entry point of the parser.
2110   Parse the regular expression REGEXP and return the structure tree.
2111   If an error is occured, ERR is set by error code, and return NULL.
2112   This function build the following tree, from regular expression <reg_exp>:
2113	   CAT
2114	   / \
2115	  /   \
2116   <reg_exp>  EOR
2117
2118   CAT means concatenation.
2119   EOR means end of regular expression.  */
2120
2121static bin_tree_t *
2122parse (regexp, preg, syntax, err)
2123     re_string_t *regexp;
2124     regex_t *preg;
2125     reg_syntax_t syntax;
2126     reg_errcode_t *err;
2127{
2128  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2129  bin_tree_t *tree, *eor, *root;
2130  re_token_t current_token;
2131  dfa->syntax = syntax;
2132  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2133  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2134  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2135    return NULL;
2136  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2137  if (tree != NULL)
2138    root = create_tree (dfa, tree, eor, CONCAT);
2139  else
2140    root = eor;
2141  if (BE (eor == NULL || root == NULL, 0))
2142    {
2143      *err = REG_ESPACE;
2144      return NULL;
2145    }
2146  return root;
2147}
2148
2149/* This function build the following tree, from regular expression
2150   <branch1>|<branch2>:
2151	   ALT
2152	   / \
2153	  /   \
2154   <branch1> <branch2>
2155
2156   ALT means alternative, which represents the operator `|'.  */
2157
2158static bin_tree_t *
2159parse_reg_exp (regexp, preg, token, syntax, nest, err)
2160     re_string_t *regexp;
2161     regex_t *preg;
2162     re_token_t *token;
2163     reg_syntax_t syntax;
2164     int nest;
2165     reg_errcode_t *err;
2166{
2167  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2168  bin_tree_t *tree, *branch = NULL;
2169  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2170  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2171    return NULL;
2172
2173  while (token->type == OP_ALT)
2174    {
2175      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2176      if (token->type != OP_ALT && token->type != END_OF_RE
2177	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2178	{
2179	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2180	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2181	    return NULL;
2182	}
2183      else
2184	branch = NULL;
2185      tree = create_tree (dfa, tree, branch, OP_ALT);
2186      if (BE (tree == NULL, 0))
2187	{
2188	  *err = REG_ESPACE;
2189	  return NULL;
2190	}
2191    }
2192  return tree;
2193}
2194
2195/* This function build the following tree, from regular expression
2196   <exp1><exp2>:
2197	CAT
2198	/ \
2199       /   \
2200   <exp1> <exp2>
2201
2202   CAT means concatenation.  */
2203
2204static bin_tree_t *
2205parse_branch (regexp, preg, token, syntax, nest, err)
2206     re_string_t *regexp;
2207     regex_t *preg;
2208     re_token_t *token;
2209     reg_syntax_t syntax;
2210     int nest;
2211     reg_errcode_t *err;
2212{
2213  bin_tree_t *tree, *exp;
2214  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2215  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2216  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2217    return NULL;
2218
2219  while (token->type != OP_ALT && token->type != END_OF_RE
2220	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2221    {
2222      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2223      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2224	{
2225	  return NULL;
2226	}
2227      if (tree != NULL && exp != NULL)
2228	{
2229	  tree = create_tree (dfa, tree, exp, CONCAT);
2230	  if (tree == NULL)
2231	    {
2232	      *err = REG_ESPACE;
2233	      return NULL;
2234	    }
2235	}
2236      else if (tree == NULL)
2237	tree = exp;
2238      /* Otherwise exp == NULL, we don't need to create new tree.  */
2239    }
2240  return tree;
2241}
2242
2243/* This function build the following tree, from regular expression a*:
2244	 *
2245	 |
2246	 a
2247*/
2248
2249static bin_tree_t *
2250parse_expression (regexp, preg, token, syntax, nest, err)
2251     re_string_t *regexp;
2252     regex_t *preg;
2253     re_token_t *token;
2254     reg_syntax_t syntax;
2255     int nest;
2256     reg_errcode_t *err;
2257{
2258  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2259  bin_tree_t *tree;
2260  switch (token->type)
2261    {
2262    case CHARACTER:
2263      tree = create_token_tree (dfa, NULL, NULL, token);
2264      if (BE (tree == NULL, 0))
2265	{
2266	  *err = REG_ESPACE;
2267	  return NULL;
2268	}
2269#ifdef RE_ENABLE_I18N
2270      if (dfa->mb_cur_max > 1)
2271	{
2272	  while (!re_string_eoi (regexp)
2273		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2274	    {
2275	      bin_tree_t *mbc_remain;
2276	      fetch_token (token, regexp, syntax);
2277	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2278	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2279	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2280		{
2281		  *err = REG_ESPACE;
2282		  return NULL;
2283		}
2284	    }
2285	}
2286#endif
2287      break;
2288    case OP_OPEN_SUBEXP:
2289      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2290      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2291	return NULL;
2292      break;
2293    case OP_OPEN_BRACKET:
2294      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2295      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2296	return NULL;
2297      break;
2298    case OP_BACK_REF:
2299      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2300	{
2301	  *err = REG_ESUBREG;
2302	  return NULL;
2303	}
2304      dfa->used_bkref_map |= 1 << token->opr.idx;
2305      tree = create_token_tree (dfa, NULL, NULL, token);
2306      if (BE (tree == NULL, 0))
2307	{
2308	  *err = REG_ESPACE;
2309	  return NULL;
2310	}
2311      ++dfa->nbackref;
2312      dfa->has_mb_node = 1;
2313      break;
2314    case OP_OPEN_DUP_NUM:
2315      if (syntax & RE_CONTEXT_INVALID_DUP)
2316	{
2317	  *err = REG_BADRPT;
2318	  return NULL;
2319	}
2320      /* FALLTHROUGH */
2321    case OP_DUP_ASTERISK:
2322    case OP_DUP_PLUS:
2323    case OP_DUP_QUESTION:
2324      if (syntax & RE_CONTEXT_INVALID_OPS)
2325	{
2326	  *err = REG_BADRPT;
2327	  return NULL;
2328	}
2329      else if (syntax & RE_CONTEXT_INDEP_OPS)
2330	{
2331	  fetch_token (token, regexp, syntax);
2332	  return parse_expression (regexp, preg, token, syntax, nest, err);
2333	}
2334      /* else fall through  */
2335    case OP_CLOSE_SUBEXP:
2336      if ((token->type == OP_CLOSE_SUBEXP) &&
2337	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2338	{
2339	  *err = REG_ERPAREN;
2340	  return NULL;
2341	}
2342      /* else fall through  */
2343    case OP_CLOSE_DUP_NUM:
2344      /* We treat it as a normal character.  */
2345
2346      /* Then we can these characters as normal characters.  */
2347      token->type = CHARACTER;
2348      /* mb_partial and word_char bits should be initialized already
2349	 by peek_token.  */
2350      tree = create_token_tree (dfa, NULL, NULL, token);
2351      if (BE (tree == NULL, 0))
2352	{
2353	  *err = REG_ESPACE;
2354	  return NULL;
2355	}
2356      break;
2357    case ANCHOR:
2358      if ((token->opr.ctx_type
2359	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2360	  && dfa->word_ops_used == 0)
2361	init_word_char (dfa);
2362      if (token->opr.ctx_type == WORD_DELIM
2363          || token->opr.ctx_type == NOT_WORD_DELIM)
2364	{
2365	  bin_tree_t *tree_first, *tree_last;
2366	  if (token->opr.ctx_type == WORD_DELIM)
2367	    {
2368	      token->opr.ctx_type = WORD_FIRST;
2369	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2370	      token->opr.ctx_type = WORD_LAST;
2371            }
2372          else
2373            {
2374	      token->opr.ctx_type = INSIDE_WORD;
2375	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2376	      token->opr.ctx_type = INSIDE_NOTWORD;
2377            }
2378	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2379	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2380	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2381	    {
2382	      *err = REG_ESPACE;
2383	      return NULL;
2384	    }
2385	}
2386      else
2387	{
2388	  tree = create_token_tree (dfa, NULL, NULL, token);
2389	  if (BE (tree == NULL, 0))
2390	    {
2391	      *err = REG_ESPACE;
2392	      return NULL;
2393	    }
2394	}
2395      /* We must return here, since ANCHORs can't be followed
2396	 by repetition operators.
2397	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2398	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2399      fetch_token (token, regexp, syntax);
2400      return tree;
2401    case OP_PERIOD:
2402      tree = create_token_tree (dfa, NULL, NULL, token);
2403      if (BE (tree == NULL, 0))
2404	{
2405	  *err = REG_ESPACE;
2406	  return NULL;
2407	}
2408      if (dfa->mb_cur_max > 1)
2409	dfa->has_mb_node = 1;
2410      break;
2411    case OP_WORD:
2412    case OP_NOTWORD:
2413      tree = build_charclass_op (dfa, regexp->trans,
2414				 (const unsigned char *) "alnum",
2415				 (const unsigned char *) "_",
2416				 token->type == OP_NOTWORD, err);
2417      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418	return NULL;
2419      break;
2420    case OP_SPACE:
2421    case OP_NOTSPACE:
2422      tree = build_charclass_op (dfa, regexp->trans,
2423				 (const unsigned char *) "space",
2424				 (const unsigned char *) "",
2425				 token->type == OP_NOTSPACE, err);
2426      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2427	return NULL;
2428      break;
2429    case OP_ALT:
2430    case END_OF_RE:
2431      return NULL;
2432    case BACK_SLASH:
2433      *err = REG_EESCAPE;
2434      return NULL;
2435    default:
2436      /* Must not happen?  */
2437#ifdef DEBUG
2438      assert (0);
2439#endif
2440      return NULL;
2441    }
2442  fetch_token (token, regexp, syntax);
2443
2444  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2445	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2446    {
2447      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2448      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2449	return NULL;
2450      /* In BRE consecutive duplications are not allowed.  */
2451      if ((syntax & RE_CONTEXT_INVALID_DUP)
2452	  && (token->type == OP_DUP_ASTERISK
2453	      || token->type == OP_OPEN_DUP_NUM))
2454	{
2455	  *err = REG_BADRPT;
2456	  return NULL;
2457	}
2458    }
2459
2460  return tree;
2461}
2462
2463/* This function build the following tree, from regular expression
2464   (<reg_exp>):
2465	 SUBEXP
2466	    |
2467	<reg_exp>
2468*/
2469
2470static bin_tree_t *
2471parse_sub_exp (regexp, preg, token, syntax, nest, err)
2472     re_string_t *regexp;
2473     regex_t *preg;
2474     re_token_t *token;
2475     reg_syntax_t syntax;
2476     int nest;
2477     reg_errcode_t *err;
2478{
2479  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2480  bin_tree_t *tree;
2481  size_t cur_nsub;
2482  cur_nsub = preg->re_nsub++;
2483
2484  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2485
2486  /* The subexpression may be a null string.  */
2487  if (token->type == OP_CLOSE_SUBEXP)
2488    tree = NULL;
2489  else
2490    {
2491      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2492      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2493        *err = REG_EPAREN;
2494      if (BE (*err != REG_NOERROR, 0))
2495	return NULL;
2496    }
2497  dfa->completed_bkref_map |= 1 << cur_nsub;
2498
2499  tree = create_tree (dfa, tree, NULL, SUBEXP);
2500  if (BE (tree == NULL, 0))
2501    {
2502      *err = REG_ESPACE;
2503      return NULL;
2504    }
2505  tree->token.opr.idx = cur_nsub;
2506  return tree;
2507}
2508
2509/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2510
2511static bin_tree_t *
2512parse_dup_op (elem, regexp, dfa, token, syntax, err)
2513     bin_tree_t *elem;
2514     re_string_t *regexp;
2515     re_dfa_t *dfa;
2516     re_token_t *token;
2517     reg_syntax_t syntax;
2518     reg_errcode_t *err;
2519{
2520  bin_tree_t *tree = NULL, *old_tree = NULL;
2521  int i, start, end, start_idx = re_string_cur_idx (regexp);
2522  re_token_t start_token = *token;
2523
2524  if (token->type == OP_OPEN_DUP_NUM)
2525    {
2526      end = 0;
2527      start = fetch_number (regexp, token, syntax);
2528      if (start == -1)
2529	{
2530	  if (token->type == CHARACTER && token->opr.c == ',')
2531	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2532	  else
2533	    {
2534	      *err = REG_BADBR; /* <re>{} is invalid.  */
2535	      return NULL;
2536	    }
2537	}
2538      if (BE (start != -2, 1))
2539	{
2540	  /* We treat "{n}" as "{n,n}".  */
2541	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2542		 : ((token->type == CHARACTER && token->opr.c == ',')
2543		    ? fetch_number (regexp, token, syntax) : -2));
2544	}
2545      if (BE (start == -2 || end == -2, 0))
2546	{
2547	  /* Invalid sequence.  */
2548	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2549	    {
2550	      if (token->type == END_OF_RE)
2551		*err = REG_EBRACE;
2552	      else
2553		*err = REG_BADBR;
2554
2555	      return NULL;
2556	    }
2557
2558	  /* If the syntax bit is set, rollback.  */
2559	  re_string_set_index (regexp, start_idx);
2560	  *token = start_token;
2561	  token->type = CHARACTER;
2562	  /* mb_partial and word_char bits should be already initialized by
2563	     peek_token.  */
2564	  return elem;
2565	}
2566
2567      if (BE (end != -1 && start > end, 0))
2568	{
2569	  /* First number greater than second.  */
2570	  *err = REG_BADBR;
2571	  return NULL;
2572	}
2573    }
2574  else
2575    {
2576      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2577      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2578    }
2579
2580  fetch_token (token, regexp, syntax);
2581
2582  if (BE (elem == NULL, 0))
2583    return NULL;
2584  if (BE (start == 0 && end == 0, 0))
2585    {
2586      postorder (elem, free_tree, NULL);
2587      return NULL;
2588    }
2589
2590  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2591  if (BE (start > 0, 0))
2592    {
2593      tree = elem;
2594      for (i = 2; i <= start; ++i)
2595	{
2596	  elem = duplicate_tree (elem, dfa);
2597	  tree = create_tree (dfa, tree, elem, CONCAT);
2598	  if (BE (elem == NULL || tree == NULL, 0))
2599	    goto parse_dup_op_espace;
2600	}
2601
2602      if (start == end)
2603	return tree;
2604
2605      /* Duplicate ELEM before it is marked optional.  */
2606      elem = duplicate_tree (elem, dfa);
2607      old_tree = tree;
2608    }
2609  else
2610    old_tree = NULL;
2611
2612  if (elem->token.type == SUBEXP)
2613    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2614
2615  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2616  if (BE (tree == NULL, 0))
2617    goto parse_dup_op_espace;
2618
2619  /* This loop is actually executed only when end != -1,
2620     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2621     already created the start+1-th copy.  */
2622  for (i = start + 2; i <= end; ++i)
2623    {
2624      elem = duplicate_tree (elem, dfa);
2625      tree = create_tree (dfa, tree, elem, CONCAT);
2626      if (BE (elem == NULL || tree == NULL, 0))
2627        goto parse_dup_op_espace;
2628
2629      tree = create_tree (dfa, tree, NULL, OP_ALT);
2630      if (BE (tree == NULL, 0))
2631        goto parse_dup_op_espace;
2632    }
2633
2634  if (old_tree)
2635    tree = create_tree (dfa, old_tree, tree, CONCAT);
2636
2637  return tree;
2638
2639 parse_dup_op_espace:
2640  *err = REG_ESPACE;
2641  return NULL;
2642}
2643
2644/* Size of the names for collating symbol/equivalence_class/character_class.
2645   I'm not sure, but maybe enough.  */
2646#define BRACKET_NAME_BUF_SIZE 32
2647
2648#ifndef _LIBC
2649  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2650     Build the range expression which starts from START_ELEM, and ends
2651     at END_ELEM.  The result are written to MBCSET and SBCSET.
2652     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2653     mbcset->range_ends, is a pointer argument sinse we may
2654     update it.  */
2655
2656static reg_errcode_t
2657# ifdef RE_ENABLE_I18N
2658build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2659     re_charset_t *mbcset;
2660     int *range_alloc;
2661# else /* not RE_ENABLE_I18N */
2662build_range_exp (sbcset, start_elem, end_elem)
2663# endif /* not RE_ENABLE_I18N */
2664     re_bitset_ptr_t sbcset;
2665     bracket_elem_t *start_elem, *end_elem;
2666{
2667  unsigned int start_ch, end_ch;
2668  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2669  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2670	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2671	  0))
2672    return REG_ERANGE;
2673
2674  /* We can handle no multi character collating elements without libc
2675     support.  */
2676  if (BE ((start_elem->type == COLL_SYM
2677	   && strlen ((char *) start_elem->opr.name) > 1)
2678	  || (end_elem->type == COLL_SYM
2679	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2680    return REG_ECOLLATE;
2681
2682# ifdef RE_ENABLE_I18N
2683  {
2684    wchar_t wc, start_wc, end_wc;
2685    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2686
2687    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2688		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2689		   : 0));
2690    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2691	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2692		 : 0));
2693    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2694		? __btowc (start_ch) : start_elem->opr.wch);
2695    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2696	      ? __btowc (end_ch) : end_elem->opr.wch);
2697    if (start_wc == WEOF || end_wc == WEOF)
2698      return REG_ECOLLATE;
2699    cmp_buf[0] = start_wc;
2700    cmp_buf[4] = end_wc;
2701    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2702      return REG_ERANGE;
2703
2704    /* Got valid collation sequence values, add them as a new entry.
2705       However, for !_LIBC we have no collation elements: if the
2706       character set is single byte, the single byte character set
2707       that we build below suffices.  parse_bracket_exp passes
2708       no MBCSET if dfa->mb_cur_max == 1.  */
2709    if (mbcset)
2710      {
2711        /* Check the space of the arrays.  */
2712        if (BE (*range_alloc == mbcset->nranges, 0))
2713          {
2714	    /* There is not enough space, need realloc.  */
2715	    wchar_t *new_array_start, *new_array_end;
2716	    int new_nranges;
2717
2718	    /* +1 in case of mbcset->nranges is 0.  */
2719	    new_nranges = 2 * mbcset->nranges + 1;
2720	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2721	       are NULL if *range_alloc == 0.  */
2722	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2723				          new_nranges);
2724	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2725				        new_nranges);
2726
2727	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2728	      return REG_ESPACE;
2729
2730	    mbcset->range_starts = new_array_start;
2731	    mbcset->range_ends = new_array_end;
2732	    *range_alloc = new_nranges;
2733          }
2734
2735        mbcset->range_starts[mbcset->nranges] = start_wc;
2736        mbcset->range_ends[mbcset->nranges++] = end_wc;
2737      }
2738
2739    /* Build the table for single byte characters.  */
2740    for (wc = 0; wc < SBC_MAX; ++wc)
2741      {
2742	cmp_buf[2] = wc;
2743	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2744	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2745	  bitset_set (sbcset, wc);
2746      }
2747  }
2748# else /* not RE_ENABLE_I18N */
2749  {
2750    unsigned int ch;
2751    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2752		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2753		   : 0));
2754    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2755	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2756		 : 0));
2757    if (start_ch > end_ch)
2758      return REG_ERANGE;
2759    /* Build the table for single byte characters.  */
2760    for (ch = 0; ch < SBC_MAX; ++ch)
2761      if (start_ch <= ch  && ch <= end_ch)
2762	bitset_set (sbcset, ch);
2763  }
2764# endif /* not RE_ENABLE_I18N */
2765  return REG_NOERROR;
2766}
2767#endif /* not _LIBC */
2768
2769#ifndef _LIBC
2770/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2771   Build the collating element which is represented by NAME.
2772   The result are written to MBCSET and SBCSET.
2773   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2774   pointer argument since we may update it.  */
2775
2776static reg_errcode_t
2777# ifdef RE_ENABLE_I18N
2778build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2779     re_charset_t *mbcset;
2780     int *coll_sym_alloc;
2781# else /* not RE_ENABLE_I18N */
2782build_collating_symbol (sbcset, name)
2783# endif /* not RE_ENABLE_I18N */
2784     re_bitset_ptr_t sbcset;
2785     const unsigned char *name;
2786{
2787  size_t name_len = strlen ((const char *) name);
2788  if (BE (name_len != 1, 0))
2789    return REG_ECOLLATE;
2790  else
2791    {
2792      bitset_set (sbcset, name[0]);
2793      return REG_NOERROR;
2794    }
2795}
2796#endif /* not _LIBC */
2797
2798/* This function parse bracket expression like "[abc]", "[a-c]",
2799   "[[.a-a.]]" etc.  */
2800
2801static bin_tree_t *
2802parse_bracket_exp (regexp, dfa, token, syntax, err)
2803     re_string_t *regexp;
2804     re_dfa_t *dfa;
2805     re_token_t *token;
2806     reg_syntax_t syntax;
2807     reg_errcode_t *err;
2808{
2809#ifdef _LIBC
2810  const unsigned char *collseqmb;
2811  const char *collseqwc;
2812  uint32_t nrules;
2813  int32_t table_size;
2814  const int32_t *symb_table;
2815  const unsigned char *extra;
2816
2817  /* Local function for parse_bracket_exp used in _LIBC environement.
2818     Seek the collating symbol entry correspondings to NAME.
2819     Return the index of the symbol in the SYMB_TABLE.  */
2820
2821  auto inline int32_t
2822  __attribute ((always_inline))
2823  seek_collating_symbol_entry (name, name_len)
2824	 const unsigned char *name;
2825	 size_t name_len;
2826    {
2827      int32_t hash = elem_hash ((const char *) name, name_len);
2828      int32_t elem = hash % table_size;
2829      int32_t second = hash % (table_size - 2);
2830      while (symb_table[2 * elem] != 0)
2831	{
2832	  /* First compare the hashing value.  */
2833	  if (symb_table[2 * elem] == hash
2834	      /* Compare the length of the name.  */
2835	      && name_len == extra[symb_table[2 * elem + 1]]
2836	      /* Compare the name.  */
2837	      && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2838			 name_len) == 0)
2839	    {
2840	      /* Yep, this is the entry.  */
2841	      break;
2842	    }
2843
2844	  /* Next entry.  */
2845	  elem += second;
2846	}
2847      return elem;
2848    }
2849
2850  /* Local function for parse_bracket_exp used in _LIBC environement.
2851     Look up the collation sequence value of BR_ELEM.
2852     Return the value if succeeded, UINT_MAX otherwise.  */
2853
2854  auto inline unsigned int
2855  __attribute ((always_inline))
2856  lookup_collation_sequence_value (br_elem)
2857	 bracket_elem_t *br_elem;
2858    {
2859      if (br_elem->type == SB_CHAR)
2860	{
2861	  /*
2862	  if (MB_CUR_MAX == 1)
2863	  */
2864	  if (nrules == 0)
2865	    return collseqmb[br_elem->opr.ch];
2866	  else
2867	    {
2868	      wint_t wc = __btowc (br_elem->opr.ch);
2869	      return __collseq_table_lookup (collseqwc, wc);
2870	    }
2871	}
2872      else if (br_elem->type == MB_CHAR)
2873	{
2874	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2875	}
2876      else if (br_elem->type == COLL_SYM)
2877	{
2878	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2879	  if (nrules != 0)
2880	    {
2881	      int32_t elem, idx;
2882	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2883						  sym_name_len);
2884	      if (symb_table[2 * elem] != 0)
2885		{
2886		  /* We found the entry.  */
2887		  idx = symb_table[2 * elem + 1];
2888		  /* Skip the name of collating element name.  */
2889		  idx += 1 + extra[idx];
2890		  /* Skip the byte sequence of the collating element.  */
2891		  idx += 1 + extra[idx];
2892		  /* Adjust for the alignment.  */
2893		  idx = (idx + 3) & ~3;
2894		  /* Skip the multibyte collation sequence value.  */
2895		  idx += sizeof (unsigned int);
2896		  /* Skip the wide char sequence of the collating element.  */
2897		  idx += sizeof (unsigned int) *
2898		    (1 + *(unsigned int *) (extra + idx));
2899		  /* Return the collation sequence value.  */
2900		  return *(unsigned int *) (extra + idx);
2901		}
2902	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2903		{
2904		  /* No valid character.  Match it as a single byte
2905		     character.  */
2906		  return collseqmb[br_elem->opr.name[0]];
2907		}
2908	    }
2909	  else if (sym_name_len == 1)
2910	    return collseqmb[br_elem->opr.name[0]];
2911	}
2912      return UINT_MAX;
2913    }
2914
2915  /* Local function for parse_bracket_exp used in _LIBC environement.
2916     Build the range expression which starts from START_ELEM, and ends
2917     at END_ELEM.  The result are written to MBCSET and SBCSET.
2918     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2919     mbcset->range_ends, is a pointer argument sinse we may
2920     update it.  */
2921
2922  auto inline reg_errcode_t
2923  __attribute ((always_inline))
2924  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2925	 re_charset_t *mbcset;
2926	 int *range_alloc;
2927	 re_bitset_ptr_t sbcset;
2928	 bracket_elem_t *start_elem, *end_elem;
2929    {
2930      unsigned int ch;
2931      uint32_t start_collseq;
2932      uint32_t end_collseq;
2933
2934      /* Equivalence Classes and Character Classes can't be a range
2935	 start/end.  */
2936      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2937	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2938	      0))
2939	return REG_ERANGE;
2940
2941      start_collseq = lookup_collation_sequence_value (start_elem);
2942      end_collseq = lookup_collation_sequence_value (end_elem);
2943      /* Check start/end collation sequence values.  */
2944      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2945	return REG_ECOLLATE;
2946      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2947	return REG_ERANGE;
2948
2949      /* Got valid collation sequence values, add them as a new entry.
2950	 However, if we have no collation elements, and the character set
2951	 is single byte, the single byte character set that we
2952	 build below suffices. */
2953      if (nrules > 0 || dfa->mb_cur_max > 1)
2954	{
2955          /* Check the space of the arrays.  */
2956          if (BE (*range_alloc == mbcset->nranges, 0))
2957	    {
2958	      /* There is not enough space, need realloc.  */
2959	      uint32_t *new_array_start;
2960	      uint32_t *new_array_end;
2961	      int new_nranges;
2962
2963	      /* +1 in case of mbcset->nranges is 0.  */
2964	      new_nranges = 2 * mbcset->nranges + 1;
2965	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2966					    new_nranges);
2967	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2968				          new_nranges);
2969
2970	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2971	        return REG_ESPACE;
2972
2973	      mbcset->range_starts = new_array_start;
2974	      mbcset->range_ends = new_array_end;
2975	      *range_alloc = new_nranges;
2976	    }
2977
2978          mbcset->range_starts[mbcset->nranges] = start_collseq;
2979          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2980	}
2981
2982      /* Build the table for single byte characters.  */
2983      for (ch = 0; ch < SBC_MAX; ch++)
2984	{
2985	  uint32_t ch_collseq;
2986	  /*
2987	  if (MB_CUR_MAX == 1)
2988	  */
2989	  if (nrules == 0)
2990	    ch_collseq = collseqmb[ch];
2991	  else
2992	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2993	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2994	    bitset_set (sbcset, ch);
2995	}
2996      return REG_NOERROR;
2997    }
2998
2999  /* Local function for parse_bracket_exp used in _LIBC environement.
3000     Build the collating element which is represented by NAME.
3001     The result are written to MBCSET and SBCSET.
3002     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3003     pointer argument sinse we may update it.  */
3004
3005  auto inline reg_errcode_t
3006  __attribute ((always_inline))
3007  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3008	 re_charset_t *mbcset;
3009	 int *coll_sym_alloc;
3010	 re_bitset_ptr_t sbcset;
3011	 const unsigned char *name;
3012    {
3013      int32_t elem, idx;
3014      size_t name_len = strlen ((const char *) name);
3015      if (nrules != 0)
3016	{
3017	  elem = seek_collating_symbol_entry (name, name_len);
3018	  if (symb_table[2 * elem] != 0)
3019	    {
3020	      /* We found the entry.  */
3021	      idx = symb_table[2 * elem + 1];
3022	      /* Skip the name of collating element name.  */
3023	      idx += 1 + extra[idx];
3024	    }
3025	  else if (symb_table[2 * elem] == 0 && name_len == 1)
3026	    {
3027	      /* No valid character, treat it as a normal
3028		 character.  */
3029	      bitset_set (sbcset, name[0]);
3030	      return REG_NOERROR;
3031	    }
3032	  else
3033	    return REG_ECOLLATE;
3034
3035	  /* Got valid collation sequence, add it as a new entry.  */
3036	  /* Check the space of the arrays.  */
3037	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3038	    {
3039	      /* Not enough, realloc it.  */
3040	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3041	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3042	      /* Use realloc since mbcset->coll_syms is NULL
3043		 if *alloc == 0.  */
3044	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3045						   new_coll_sym_alloc);
3046	      if (BE (new_coll_syms == NULL, 0))
3047		return REG_ESPACE;
3048	      mbcset->coll_syms = new_coll_syms;
3049	      *coll_sym_alloc = new_coll_sym_alloc;
3050	    }
3051	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3052	  return REG_NOERROR;
3053	}
3054      else
3055	{
3056	  if (BE (name_len != 1, 0))
3057	    return REG_ECOLLATE;
3058	  else
3059	    {
3060	      bitset_set (sbcset, name[0]);
3061	      return REG_NOERROR;
3062	    }
3063	}
3064    }
3065#endif
3066
3067  re_token_t br_token;
3068  re_bitset_ptr_t sbcset;
3069#ifdef RE_ENABLE_I18N
3070  re_charset_t *mbcset;
3071  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3072  int equiv_class_alloc = 0, char_class_alloc = 0;
3073#endif /* not RE_ENABLE_I18N */
3074  int non_match = 0;
3075  bin_tree_t *work_tree;
3076  int token_len;
3077  int first_round = 1;
3078#ifdef _LIBC
3079  collseqmb = (const unsigned char *)
3080    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3081  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3082  if (nrules)
3083    {
3084      /*
3085      if (MB_CUR_MAX > 1)
3086      */
3087	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3088      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3089      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3090						  _NL_COLLATE_SYMB_TABLEMB);
3091      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3092						   _NL_COLLATE_SYMB_EXTRAMB);
3093    }
3094#endif
3095  sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3096#ifdef RE_ENABLE_I18N
3097  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3098#endif /* RE_ENABLE_I18N */
3099#ifdef RE_ENABLE_I18N
3100  if (BE (sbcset == NULL || mbcset == NULL, 0))
3101#else
3102  if (BE (sbcset == NULL, 0))
3103#endif /* RE_ENABLE_I18N */
3104    {
3105      *err = REG_ESPACE;
3106      return NULL;
3107    }
3108
3109  token_len = peek_token_bracket (token, regexp, syntax);
3110  if (BE (token->type == END_OF_RE, 0))
3111    {
3112      *err = REG_BADPAT;
3113      goto parse_bracket_exp_free_return;
3114    }
3115  if (token->type == OP_NON_MATCH_LIST)
3116    {
3117#ifdef RE_ENABLE_I18N
3118      mbcset->non_match = 1;
3119#endif /* not RE_ENABLE_I18N */
3120      non_match = 1;
3121      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3122	bitset_set (sbcset, '\0');
3123      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3124      token_len = peek_token_bracket (token, regexp, syntax);
3125      if (BE (token->type == END_OF_RE, 0))
3126	{
3127	  *err = REG_BADPAT;
3128	  goto parse_bracket_exp_free_return;
3129	}
3130    }
3131
3132  /* We treat the first ']' as a normal character.  */
3133  if (token->type == OP_CLOSE_BRACKET)
3134    token->type = CHARACTER;
3135
3136  while (1)
3137    {
3138      bracket_elem_t start_elem, end_elem;
3139      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3140      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3141      reg_errcode_t ret;
3142      int token_len2 = 0, is_range_exp = 0;
3143      re_token_t token2;
3144
3145      start_elem.opr.name = start_name_buf;
3146      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3147				   syntax, first_round);
3148      if (BE (ret != REG_NOERROR, 0))
3149	{
3150	  *err = ret;
3151	  goto parse_bracket_exp_free_return;
3152	}
3153      first_round = 0;
3154
3155      /* Get information about the next token.  We need it in any case.  */
3156      token_len = peek_token_bracket (token, regexp, syntax);
3157
3158      /* Do not check for ranges if we know they are not allowed.  */
3159      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3160	{
3161	  if (BE (token->type == END_OF_RE, 0))
3162	    {
3163	      *err = REG_EBRACK;
3164	      goto parse_bracket_exp_free_return;
3165	    }
3166	  if (token->type == OP_CHARSET_RANGE)
3167	    {
3168	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3169	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3170	      if (BE (token2.type == END_OF_RE, 0))
3171		{
3172		  *err = REG_EBRACK;
3173		  goto parse_bracket_exp_free_return;
3174		}
3175	      if (token2.type == OP_CLOSE_BRACKET)
3176		{
3177		  /* We treat the last '-' as a normal character.  */
3178		  re_string_skip_bytes (regexp, -token_len);
3179		  token->type = CHARACTER;
3180		}
3181	      else
3182		is_range_exp = 1;
3183	    }
3184	}
3185
3186      if (is_range_exp == 1)
3187	{
3188	  end_elem.opr.name = end_name_buf;
3189	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3190				       dfa, syntax, 1);
3191	  if (BE (ret != REG_NOERROR, 0))
3192	    {
3193	      *err = ret;
3194	      goto parse_bracket_exp_free_return;
3195	    }
3196
3197	  token_len = peek_token_bracket (token, regexp, syntax);
3198
3199#ifdef _LIBC
3200	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3201				  &start_elem, &end_elem);
3202#else
3203# ifdef RE_ENABLE_I18N
3204	  *err = build_range_exp (sbcset,
3205				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3206				  &range_alloc, &start_elem, &end_elem);
3207# else
3208	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3209# endif
3210#endif /* RE_ENABLE_I18N */
3211	  if (BE (*err != REG_NOERROR, 0))
3212	    goto parse_bracket_exp_free_return;
3213	}
3214      else
3215	{
3216	  switch (start_elem.type)
3217	    {
3218	    case SB_CHAR:
3219	      bitset_set (sbcset, start_elem.opr.ch);
3220	      break;
3221#ifdef RE_ENABLE_I18N
3222	    case MB_CHAR:
3223	      /* Check whether the array has enough space.  */
3224	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3225		{
3226		  wchar_t *new_mbchars;
3227		  /* Not enough, realloc it.  */
3228		  /* +1 in case of mbcset->nmbchars is 0.  */
3229		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3230		  /* Use realloc since array is NULL if *alloc == 0.  */
3231		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3232					    mbchar_alloc);
3233		  if (BE (new_mbchars == NULL, 0))
3234		    goto parse_bracket_exp_espace;
3235		  mbcset->mbchars = new_mbchars;
3236		}
3237	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3238	      break;
3239#endif /* RE_ENABLE_I18N */
3240	    case EQUIV_CLASS:
3241	      *err = build_equiv_class (sbcset,
3242#ifdef RE_ENABLE_I18N
3243					mbcset, &equiv_class_alloc,
3244#endif /* RE_ENABLE_I18N */
3245					start_elem.opr.name);
3246	      if (BE (*err != REG_NOERROR, 0))
3247		goto parse_bracket_exp_free_return;
3248	      break;
3249	    case COLL_SYM:
3250	      *err = build_collating_symbol (sbcset,
3251#ifdef RE_ENABLE_I18N
3252					     mbcset, &coll_sym_alloc,
3253#endif /* RE_ENABLE_I18N */
3254					     start_elem.opr.name);
3255	      if (BE (*err != REG_NOERROR, 0))
3256		goto parse_bracket_exp_free_return;
3257	      break;
3258	    case CHAR_CLASS:
3259	      *err = build_charclass (regexp->trans, sbcset,
3260#ifdef RE_ENABLE_I18N
3261				      mbcset, &char_class_alloc,
3262#endif /* RE_ENABLE_I18N */
3263				      start_elem.opr.name, syntax);
3264	      if (BE (*err != REG_NOERROR, 0))
3265	       goto parse_bracket_exp_free_return;
3266	      break;
3267	    default:
3268	      assert (0);
3269	      break;
3270	    }
3271	}
3272      if (BE (token->type == END_OF_RE, 0))
3273	{
3274	  *err = REG_EBRACK;
3275	  goto parse_bracket_exp_free_return;
3276	}
3277      if (token->type == OP_CLOSE_BRACKET)
3278	break;
3279    }
3280
3281  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3282
3283  /* If it is non-matching list.  */
3284  if (non_match)
3285    bitset_not (sbcset);
3286
3287#ifdef RE_ENABLE_I18N
3288  /* Ensure only single byte characters are set.  */
3289  if (dfa->mb_cur_max > 1)
3290    bitset_mask (sbcset, dfa->sb_char);
3291
3292  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3293      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3294						     || mbcset->non_match)))
3295    {
3296      bin_tree_t *mbc_tree;
3297      int sbc_idx;
3298      /* Build a tree for complex bracket.  */
3299      dfa->has_mb_node = 1;
3300      br_token.type = COMPLEX_BRACKET;
3301      br_token.opr.mbcset = mbcset;
3302      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3303      if (BE (mbc_tree == NULL, 0))
3304	goto parse_bracket_exp_espace;
3305      for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3306	if (sbcset[sbc_idx])
3307	  break;
3308      /* If there are no bits set in sbcset, there is no point
3309	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3310      if (sbc_idx < BITSET_UINTS)
3311	{
3312          /* Build a tree for simple bracket.  */
3313          br_token.type = SIMPLE_BRACKET;
3314          br_token.opr.sbcset = sbcset;
3315          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3316          if (BE (work_tree == NULL, 0))
3317            goto parse_bracket_exp_espace;
3318
3319          /* Then join them by ALT node.  */
3320          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3321          if (BE (work_tree == NULL, 0))
3322            goto parse_bracket_exp_espace;
3323	}
3324      else
3325	{
3326	  re_free (sbcset);
3327	  work_tree = mbc_tree;
3328	}
3329    }
3330  else
3331#endif /* not RE_ENABLE_I18N */
3332    {
3333#ifdef RE_ENABLE_I18N
3334      free_charset (mbcset);
3335#endif
3336      /* Build a tree for simple bracket.  */
3337      br_token.type = SIMPLE_BRACKET;
3338      br_token.opr.sbcset = sbcset;
3339      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3340      if (BE (work_tree == NULL, 0))
3341        goto parse_bracket_exp_espace;
3342    }
3343  return work_tree;
3344
3345 parse_bracket_exp_espace:
3346  *err = REG_ESPACE;
3347 parse_bracket_exp_free_return:
3348  re_free (sbcset);
3349#ifdef RE_ENABLE_I18N
3350  free_charset (mbcset);
3351#endif /* RE_ENABLE_I18N */
3352  return NULL;
3353}
3354
3355/* Parse an element in the bracket expression.  */
3356
3357static reg_errcode_t
3358parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3359		       accept_hyphen)
3360     bracket_elem_t *elem;
3361     re_string_t *regexp;
3362     re_token_t *token;
3363     int token_len;
3364     re_dfa_t *dfa;
3365     reg_syntax_t syntax;
3366     int accept_hyphen;
3367{
3368#ifdef RE_ENABLE_I18N
3369  int cur_char_size;
3370  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3371  if (cur_char_size > 1)
3372    {
3373      elem->type = MB_CHAR;
3374      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3375      re_string_skip_bytes (regexp, cur_char_size);
3376      return REG_NOERROR;
3377    }
3378#endif /* RE_ENABLE_I18N */
3379  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3380  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3381      || token->type == OP_OPEN_EQUIV_CLASS)
3382    return parse_bracket_symbol (elem, regexp, token);
3383  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3384    {
3385      /* A '-' must only appear as anything but a range indicator before
3386	 the closing bracket.  Everything else is an error.  */
3387      re_token_t token2;
3388      (void) peek_token_bracket (&token2, regexp, syntax);
3389      if (token2.type != OP_CLOSE_BRACKET)
3390	/* The actual error value is not standardized since this whole
3391	   case is undefined.  But ERANGE makes good sense.  */
3392	return REG_ERANGE;
3393    }
3394  elem->type = SB_CHAR;
3395  elem->opr.ch = token->opr.c;
3396  return REG_NOERROR;
3397}
3398
3399/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3400   such as [:<character_class>:], [.<collating_element>.], and
3401   [=<equivalent_class>=].  */
3402
3403static reg_errcode_t
3404parse_bracket_symbol (elem, regexp, token)
3405     bracket_elem_t *elem;
3406     re_string_t *regexp;
3407     re_token_t *token;
3408{
3409  unsigned char ch, delim = token->opr.c;
3410  int i = 0;
3411  if (re_string_eoi(regexp))
3412    return REG_EBRACK;
3413  for (;; ++i)
3414    {
3415      if (i >= BRACKET_NAME_BUF_SIZE)
3416	return REG_EBRACK;
3417      if (token->type == OP_OPEN_CHAR_CLASS)
3418	ch = re_string_fetch_byte_case (regexp);
3419      else
3420	ch = re_string_fetch_byte (regexp);
3421      if (re_string_eoi(regexp))
3422	return REG_EBRACK;
3423      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3424	break;
3425      elem->opr.name[i] = ch;
3426    }
3427  re_string_skip_bytes (regexp, 1);
3428  elem->opr.name[i] = '\0';
3429  switch (token->type)
3430    {
3431    case OP_OPEN_COLL_ELEM:
3432      elem->type = COLL_SYM;
3433      break;
3434    case OP_OPEN_EQUIV_CLASS:
3435      elem->type = EQUIV_CLASS;
3436      break;
3437    case OP_OPEN_CHAR_CLASS:
3438      elem->type = CHAR_CLASS;
3439      break;
3440    default:
3441      break;
3442    }
3443  return REG_NOERROR;
3444}
3445
3446  /* Helper function for parse_bracket_exp.
3447     Build the equivalence class which is represented by NAME.
3448     The result are written to MBCSET and SBCSET.
3449     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3450     is a pointer argument sinse we may update it.  */
3451
3452static reg_errcode_t
3453#ifdef RE_ENABLE_I18N
3454build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3455     re_charset_t *mbcset;
3456     int *equiv_class_alloc;
3457#else /* not RE_ENABLE_I18N */
3458build_equiv_class (sbcset, name)
3459#endif /* not RE_ENABLE_I18N */
3460     re_bitset_ptr_t sbcset;
3461     const unsigned char *name;
3462{
3463#if defined _LIBC
3464  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3465  if (nrules != 0)
3466    {
3467      const int32_t *table, *indirect;
3468      const unsigned char *weights, *extra, *cp;
3469      unsigned char char_buf[2];
3470      int32_t idx1, idx2;
3471      unsigned int ch;
3472      size_t len;
3473      /* This #include defines a local function!  */
3474# include <locale/weight.h>
3475      /* Calculate the index for equivalence class.  */
3476      cp = name;
3477      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3478      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3479					       _NL_COLLATE_WEIGHTMB);
3480      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3481						   _NL_COLLATE_EXTRAMB);
3482      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3483						_NL_COLLATE_INDIRECTMB);
3484      idx1 = findidx (&cp);
3485      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3486	/* This isn't a valid character.  */
3487	return REG_ECOLLATE;
3488
3489      /* Build single byte matcing table for this equivalence class.  */
3490      char_buf[1] = (unsigned char) '\0';
3491      len = weights[idx1];
3492      for (ch = 0; ch < SBC_MAX; ++ch)
3493	{
3494	  char_buf[0] = ch;
3495	  cp = char_buf;
3496	  idx2 = findidx (&cp);
3497/*
3498	  idx2 = table[ch];
3499*/
3500	  if (idx2 == 0)
3501	    /* This isn't a valid character.  */
3502	    continue;
3503	  if (len == weights[idx2])
3504	    {
3505	      int cnt = 0;
3506	      while (cnt <= len &&
3507		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3508		++cnt;
3509
3510	      if (cnt > len)
3511		bitset_set (sbcset, ch);
3512	    }
3513	}
3514      /* Check whether the array has enough space.  */
3515      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3516	{
3517	  /* Not enough, realloc it.  */
3518	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3519	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3520	  /* Use realloc since the array is NULL if *alloc == 0.  */
3521	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3522						   int32_t,
3523						   new_equiv_class_alloc);
3524	  if (BE (new_equiv_classes == NULL, 0))
3525	    return REG_ESPACE;
3526	  mbcset->equiv_classes = new_equiv_classes;
3527	  *equiv_class_alloc = new_equiv_class_alloc;
3528	}
3529      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3530    }
3531  else
3532#endif /* _LIBC */
3533    {
3534      if (BE (strlen ((const char *) name) != 1, 0))
3535	return REG_ECOLLATE;
3536      bitset_set (sbcset, *name);
3537    }
3538  return REG_NOERROR;
3539}
3540
3541  /* Helper function for parse_bracket_exp.
3542     Build the character class which is represented by NAME.
3543     The result are written to MBCSET and SBCSET.
3544     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3545     is a pointer argument sinse we may update it.  */
3546
3547static reg_errcode_t
3548#ifdef RE_ENABLE_I18N
3549build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3550     re_charset_t *mbcset;
3551     int *char_class_alloc;
3552#else /* not RE_ENABLE_I18N */
3553build_charclass (trans, sbcset, class_name, syntax)
3554#endif /* not RE_ENABLE_I18N */
3555     unsigned RE_TRANSLATE_TYPE trans;
3556     re_bitset_ptr_t sbcset;
3557     const unsigned char *class_name;
3558     reg_syntax_t syntax;
3559{
3560  int i;
3561  const char *name = (const char *) class_name;
3562
3563  /* In case of REG_ICASE "upper" and "lower" match the both of
3564     upper and lower cases.  */
3565  if ((syntax & RE_ICASE)
3566      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3567    name = "alpha";
3568
3569#ifdef RE_ENABLE_I18N
3570  /* Check the space of the arrays.  */
3571  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3572    {
3573      /* Not enough, realloc it.  */
3574      /* +1 in case of mbcset->nchar_classes is 0.  */
3575      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3576      /* Use realloc since array is NULL if *alloc == 0.  */
3577      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3578					       new_char_class_alloc);
3579      if (BE (new_char_classes == NULL, 0))
3580	return REG_ESPACE;
3581      mbcset->char_classes = new_char_classes;
3582      *char_class_alloc = new_char_class_alloc;
3583    }
3584  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3585#endif /* RE_ENABLE_I18N */
3586
3587#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3588    for (i = 0; i < SBC_MAX; ++i)		\
3589      {						\
3590	if (ctype_func (i))			\
3591	  {					\
3592	    int ch = trans ? trans[i] : i;	\
3593	    bitset_set (sbcset, ch);		\
3594	  }					\
3595      }
3596
3597  if (strcmp (name, "alnum") == 0)
3598    BUILD_CHARCLASS_LOOP (isalnum)
3599  else if (strcmp (name, "cntrl") == 0)
3600    BUILD_CHARCLASS_LOOP (iscntrl)
3601  else if (strcmp (name, "lower") == 0)
3602    BUILD_CHARCLASS_LOOP (islower)
3603  else if (strcmp (name, "space") == 0)
3604    BUILD_CHARCLASS_LOOP (isspace)
3605  else if (strcmp (name, "alpha") == 0)
3606    BUILD_CHARCLASS_LOOP (isalpha)
3607  else if (strcmp (name, "digit") == 0)
3608    BUILD_CHARCLASS_LOOP (isdigit)
3609  else if (strcmp (name, "print") == 0)
3610    BUILD_CHARCLASS_LOOP (isprint)
3611  else if (strcmp (name, "upper") == 0)
3612    BUILD_CHARCLASS_LOOP (isupper)
3613  else if (strcmp (name, "blank") == 0)
3614    BUILD_CHARCLASS_LOOP (isblank)
3615  else if (strcmp (name, "graph") == 0)
3616    BUILD_CHARCLASS_LOOP (isgraph)
3617  else if (strcmp (name, "punct") == 0)
3618    BUILD_CHARCLASS_LOOP (ispunct)
3619  else if (strcmp (name, "xdigit") == 0)
3620    BUILD_CHARCLASS_LOOP (isxdigit)
3621  else
3622    return REG_ECTYPE;
3623
3624  return REG_NOERROR;
3625}
3626
3627static bin_tree_t *
3628build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3629     re_dfa_t *dfa;
3630     unsigned RE_TRANSLATE_TYPE trans;
3631     const unsigned char *class_name;
3632     const unsigned char *extra;
3633     int non_match;
3634     reg_errcode_t *err;
3635{
3636  re_bitset_ptr_t sbcset;
3637#ifdef RE_ENABLE_I18N
3638  re_charset_t *mbcset;
3639  int alloc = 0;
3640#endif /* not RE_ENABLE_I18N */
3641  reg_errcode_t ret;
3642  re_token_t br_token;
3643  bin_tree_t *tree;
3644
3645  sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3646#ifdef RE_ENABLE_I18N
3647  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3648#endif /* RE_ENABLE_I18N */
3649
3650#ifdef RE_ENABLE_I18N
3651  if (BE (sbcset == NULL || mbcset == NULL, 0))
3652#else /* not RE_ENABLE_I18N */
3653  if (BE (sbcset == NULL, 0))
3654#endif /* not RE_ENABLE_I18N */
3655    {
3656      *err = REG_ESPACE;
3657      return NULL;
3658    }
3659
3660  if (non_match)
3661    {
3662#ifdef RE_ENABLE_I18N
3663      /*
3664      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3665	bitset_set(cset->sbcset, '\0');
3666      */
3667      mbcset->non_match = 1;
3668#endif /* not RE_ENABLE_I18N */
3669    }
3670
3671  /* We don't care the syntax in this case.  */
3672  ret = build_charclass (trans, sbcset,
3673#ifdef RE_ENABLE_I18N
3674			 mbcset, &alloc,
3675#endif /* RE_ENABLE_I18N */
3676			 class_name, 0);
3677
3678  if (BE (ret != REG_NOERROR, 0))
3679    {
3680      re_free (sbcset);
3681#ifdef RE_ENABLE_I18N
3682      free_charset (mbcset);
3683#endif /* RE_ENABLE_I18N */
3684      *err = ret;
3685      return NULL;
3686    }
3687  /* \w match '_' also.  */
3688  for (; *extra; extra++)
3689    bitset_set (sbcset, *extra);
3690
3691  /* If it is non-matching list.  */
3692  if (non_match)
3693    bitset_not (sbcset);
3694
3695#ifdef RE_ENABLE_I18N
3696  /* Ensure only single byte characters are set.  */
3697  if (dfa->mb_cur_max > 1)
3698    bitset_mask (sbcset, dfa->sb_char);
3699#endif
3700
3701  /* Build a tree for simple bracket.  */
3702  br_token.type = SIMPLE_BRACKET;
3703  br_token.opr.sbcset = sbcset;
3704  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3705  if (BE (tree == NULL, 0))
3706    goto build_word_op_espace;
3707
3708#ifdef RE_ENABLE_I18N
3709  if (dfa->mb_cur_max > 1)
3710    {
3711      bin_tree_t *mbc_tree;
3712      /* Build a tree for complex bracket.  */
3713      br_token.type = COMPLEX_BRACKET;
3714      br_token.opr.mbcset = mbcset;
3715      dfa->has_mb_node = 1;
3716      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3717      if (BE (mbc_tree == NULL, 0))
3718	goto build_word_op_espace;
3719      /* Then join them by ALT node.  */
3720      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3721      if (BE (mbc_tree != NULL, 1))
3722	return tree;
3723    }
3724  else
3725    {
3726      free_charset (mbcset);
3727      return tree;
3728    }
3729#else /* not RE_ENABLE_I18N */
3730  return tree;
3731#endif /* not RE_ENABLE_I18N */
3732
3733 build_word_op_espace:
3734  re_free (sbcset);
3735#ifdef RE_ENABLE_I18N
3736  free_charset (mbcset);
3737#endif /* RE_ENABLE_I18N */
3738  *err = REG_ESPACE;
3739  return NULL;
3740}
3741
3742/* This is intended for the expressions like "a{1,3}".
3743   Fetch a number from `input', and return the number.
3744   Return -1, if the number field is empty like "{,1}".
3745   Return -2, If an error is occured.  */
3746
3747static int
3748fetch_number (input, token, syntax)
3749     re_string_t *input;
3750     re_token_t *token;
3751     reg_syntax_t syntax;
3752{
3753  int num = -1;
3754  unsigned char c;
3755  while (1)
3756    {
3757      fetch_token (token, input, syntax);
3758      c = token->opr.c;
3759      if (BE (token->type == END_OF_RE, 0))
3760	return -2;
3761      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3762	break;
3763      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3764	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3765      num = (num > RE_DUP_MAX) ? -2 : num;
3766    }
3767  return num;
3768}
3769
3770#ifdef RE_ENABLE_I18N
3771static void
3772free_charset (re_charset_t *cset)
3773{
3774  re_free (cset->mbchars);
3775# ifdef _LIBC
3776  re_free (cset->coll_syms);
3777  re_free (cset->equiv_classes);
3778  re_free (cset->range_starts);
3779  re_free (cset->range_ends);
3780# endif
3781  re_free (cset->char_classes);
3782  re_free (cset);
3783}
3784#endif /* RE_ENABLE_I18N */
3785
3786/* Functions for binary tree operation.  */
3787
3788/* Create a tree node.  */
3789
3790static bin_tree_t *
3791create_tree (dfa, left, right, type)
3792     re_dfa_t *dfa;
3793     bin_tree_t *left;
3794     bin_tree_t *right;
3795     re_token_type_t type;
3796{
3797  re_token_t t;
3798  t.type = type;
3799  return create_token_tree (dfa, left, right, &t);
3800}
3801
3802static bin_tree_t *
3803create_token_tree (dfa, left, right, token)
3804     re_dfa_t *dfa;
3805     bin_tree_t *left;
3806     bin_tree_t *right;
3807     const re_token_t *token;
3808{
3809  bin_tree_t *tree;
3810  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3811    {
3812      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3813
3814      if (storage == NULL)
3815	return NULL;
3816      storage->next = dfa->str_tree_storage;
3817      dfa->str_tree_storage = storage;
3818      dfa->str_tree_storage_idx = 0;
3819    }
3820  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3821
3822  tree->parent = NULL;
3823  tree->left = left;
3824  tree->right = right;
3825  tree->token = *token;
3826  tree->token.duplicated = 0;
3827  tree->token.opt_subexp = 0;
3828  tree->first = NULL;
3829  tree->next = NULL;
3830  tree->node_idx = -1;
3831
3832  if (left != NULL)
3833    left->parent = tree;
3834  if (right != NULL)
3835    right->parent = tree;
3836  return tree;
3837}
3838
3839/* Mark the tree SRC as an optional subexpression.
3840   To be called from preorder or postorder.  */
3841
3842static reg_errcode_t
3843mark_opt_subexp (extra, node)
3844     void *extra;
3845     bin_tree_t *node;
3846{
3847  int idx = (int) (long) extra;
3848  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849    node->token.opt_subexp = 1;
3850
3851  return REG_NOERROR;
3852}
3853
3854/* Free the allocated memory inside NODE. */
3855
3856static void
3857free_token (re_token_t *node)
3858{
3859#ifdef RE_ENABLE_I18N
3860  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861    free_charset (node->opr.mbcset);
3862  else
3863#endif /* RE_ENABLE_I18N */
3864    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865      re_free (node->opr.sbcset);
3866}
3867
3868/* Worker function for tree walking.  Free the allocated memory inside NODE
3869   and its children. */
3870
3871static reg_errcode_t
3872free_tree (void *extra, bin_tree_t *node)
3873{
3874  free_token (&node->token);
3875  return REG_NOERROR;
3876}
3877
3878
3879/* Duplicate the node SRC, and return new node.  This is a preorder
3880   visit similar to the one implemented by the generic visitor, but
3881   we need more infrastructure to maintain two parallel trees --- so,
3882   it's easier to duplicate.  */
3883
3884static bin_tree_t *
3885duplicate_tree (root, dfa)
3886     const bin_tree_t *root;
3887     re_dfa_t *dfa;
3888{
3889  const bin_tree_t *node;
3890  bin_tree_t *dup_root;
3891  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3892
3893  for (node = root; ; )
3894    {
3895      /* Create a new tree and link it back to the current parent.  */
3896      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3897      if (*p_new == NULL)
3898	return NULL;
3899      (*p_new)->parent = dup_node;
3900      (*p_new)->token.duplicated = 1;
3901      dup_node = *p_new;
3902
3903      /* Go to the left node, or up and to the right.  */
3904      if (node->left)
3905	{
3906	  node = node->left;
3907	  p_new = &dup_node->left;
3908	}
3909      else
3910	{
3911	  const bin_tree_t *prev = NULL;
3912	  while (node->right == prev || node->right == NULL)
3913	    {
3914	      prev = node;
3915	      node = node->parent;
3916	      dup_node = dup_node->parent;
3917	      if (!node)
3918	        return dup_root;
3919	    }
3920	  node = node->right;
3921	  p_new = &dup_node->right;
3922	}
3923    }
3924}
3925