1/* Extended regular expression matching and search library.
2   Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 3, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License along
17   with this program; if not, write to the Free Software Foundation,
18   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21					  size_t length, reg_syntax_t syntax);
22static void re_compile_fastmap_iter (regex_t *bufp,
23				     const re_dfastate_t *init_state,
24				     char *fastmap);
25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26#ifdef RE_ENABLE_I18N
27static void free_charset (re_charset_t *cset);
28#endif /* RE_ENABLE_I18N */
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36			       reg_errcode_t (fn (void *, bin_tree_t *)),
37			       void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39				reg_errcode_t (fn (void *, bin_tree_t *)),
40				void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44				 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50				   unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53					 Idx node, bool root);
54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55static Idx fetch_number (re_string_t *input, re_token_t *token,
56			 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
58			reg_syntax_t syntax) internal_function;
59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60			  reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62				  re_token_t *token, reg_syntax_t syntax,
63				  Idx nest, reg_errcode_t *err);
64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65				 re_token_t *token, reg_syntax_t syntax,
66				 Idx nest, reg_errcode_t *err);
67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68				     re_token_t *token, reg_syntax_t syntax,
69				     Idx nest, reg_errcode_t *err);
70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71				  re_token_t *token, reg_syntax_t syntax,
72				  Idx nest, reg_errcode_t *err);
73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74				 re_dfa_t *dfa, re_token_t *token,
75				 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77				      re_token_t *token, reg_syntax_t syntax,
78				      reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80					    re_string_t *regexp,
81					    re_token_t *token, int token_len,
82					    re_dfa_t *dfa,
83					    reg_syntax_t syntax,
84					    bool accept_hyphen);
85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86					  re_string_t *regexp,
87					  re_token_t *token);
88#ifdef RE_ENABLE_I18N
89static reg_errcode_t build_equiv_class (bitset_t sbcset,
90					re_charset_t *mbcset,
91					Idx *equiv_class_alloc,
92					const unsigned char *name);
93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94				      bitset_t sbcset,
95				      re_charset_t *mbcset,
96				      Idx *char_class_alloc,
97				      const unsigned char *class_name,
98				      reg_syntax_t syntax);
99#else  /* not RE_ENABLE_I18N */
100static reg_errcode_t build_equiv_class (bitset_t sbcset,
101					const unsigned char *name);
102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103				      bitset_t sbcset,
104				      const unsigned char *class_name,
105				      reg_syntax_t syntax);
106#endif /* not RE_ENABLE_I18N */
107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108				       RE_TRANSLATE_TYPE trans,
109				       const unsigned char *class_name,
110				       const unsigned char *extra,
111				       bool non_match, reg_errcode_t *err);
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113				bin_tree_t *left, bin_tree_t *right,
114				re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116				      bin_tree_t *left, bin_tree_t *right,
117				      const re_token_t *token);
118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122
123/* This table gives an error message for each of the error codes listed
124   in regex.h.  Obviously the order here has to be same as there.
125   POSIX doesn't require that we do anything for REG_NOERROR,
126   but why not be nice?  */
127
128static const char __re_error_msgid[] =
129  {
130#define REG_NOERROR_IDX	0
131    gettext_noop ("Success")	/* REG_NOERROR */
132    "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134    gettext_noop ("No match")	/* REG_NOMATCH */
135    "\0"
136#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138    "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141    "\0"
142#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144    "\0"
145#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147    "\0"
148#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150    "\0"
151#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153    "\0"
154#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156    "\0"
157#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159    "\0"
160#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162    "\0"
163#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164    gettext_noop ("Invalid range end")	/* REG_ERANGE */
165    "\0"
166#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167    gettext_noop ("Memory exhausted") /* REG_ESPACE */
168    "\0"
169#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171    "\0"
172#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173    gettext_noop ("Premature end of regular expression") /* REG_EEND */
174    "\0"
175#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176    gettext_noop ("Regular expression too big") /* REG_ESIZE */
177    "\0"
178#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180  };
181
182static const size_t __re_error_msgid_idx[] =
183  {
184    REG_NOERROR_IDX,
185    REG_NOMATCH_IDX,
186    REG_BADPAT_IDX,
187    REG_ECOLLATE_IDX,
188    REG_ECTYPE_IDX,
189    REG_EESCAPE_IDX,
190    REG_ESUBREG_IDX,
191    REG_EBRACK_IDX,
192    REG_EPAREN_IDX,
193    REG_EBRACE_IDX,
194    REG_BADBR_IDX,
195    REG_ERANGE_IDX,
196    REG_ESPACE_IDX,
197    REG_BADRPT_IDX,
198    REG_EEND_IDX,
199    REG_ESIZE_IDX,
200    REG_ERPAREN_IDX
201  };
202
203/* Entry points for GNU code.  */
204
205/* re_compile_pattern is the GNU regular expression compiler: it
206   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207   Returns 0 if the pattern was valid, otherwise an error string.
208
209   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210   are set in BUFP on entry.  */
211
212#ifdef _LIBC
213const char *
214re_compile_pattern (pattern, length, bufp)
215    const char *pattern;
216    size_t length;
217    struct re_pattern_buffer *bufp;
218#else /* size_t might promote */
219const char *
220re_compile_pattern (const char *pattern, size_t length,
221		    struct re_pattern_buffer *bufp)
222#endif
223{
224  reg_errcode_t ret;
225
226  /* And GNU code determines whether or not to get register information
227     by passing null for the REGS argument to re_match, etc., not by
228     setting no_sub, unless RE_NO_SUB is set.  */
229  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230
231  /* Match anchors at newline.  */
232  bufp->newline_anchor = 1;
233
234  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235
236  if (!ret)
237    return NULL;
238  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239}
240#ifdef _LIBC
241weak_alias (__re_compile_pattern, re_compile_pattern)
242#endif
243
244/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
245   also be assigned to arbitrarily: each pattern buffer stores its own
246   syntax, so it can be changed between regex compilations.  */
247/* This has no initializer because initialized variables in Emacs
248   become read-only after dumping.  */
249reg_syntax_t re_syntax_options;
250
251
252/* Specify the precise syntax of regexps for compilation.  This provides
253   for compatibility for various utilities which historically have
254   different, incompatible syntaxes.
255
256   The argument SYNTAX is a bit mask comprised of the various bits
257   defined in regex.h.  We return the old syntax.  */
258
259reg_syntax_t
260re_set_syntax (syntax)
261    reg_syntax_t syntax;
262{
263  reg_syntax_t ret = re_syntax_options;
264
265  re_syntax_options = syntax;
266  return ret;
267}
268#ifdef _LIBC
269weak_alias (__re_set_syntax, re_set_syntax)
270#endif
271
272int
273re_compile_fastmap (bufp)
274    struct re_pattern_buffer *bufp;
275{
276  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277  char *fastmap = bufp->fastmap;
278
279  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281  if (dfa->init_state != dfa->init_state_word)
282    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283  if (dfa->init_state != dfa->init_state_nl)
284    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285  if (dfa->init_state != dfa->init_state_begbuf)
286    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287  bufp->fastmap_accurate = 1;
288  return 0;
289}
290#ifdef _LIBC
291weak_alias (__re_compile_fastmap, re_compile_fastmap)
292#endif
293
294static inline void
295__attribute ((always_inline))
296re_set_fastmap (char *fastmap, bool icase, int ch)
297{
298  fastmap[ch] = 1;
299  if (icase)
300    fastmap[tolower (ch)] = 1;
301}
302
303/* Helper function for re_compile_fastmap.
304   Compile fastmap for the initial_state INIT_STATE.  */
305
306static void
307re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308			 char *fastmap)
309{
310  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
311  Idx node_cnt;
312  bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314    {
315      Idx node = init_state->nodes.elems[node_cnt];
316      re_token_type_t type = dfa->nodes[node].type;
317
318      if (type == CHARACTER)
319	{
320	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321#ifdef RE_ENABLE_I18N
322	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323	    {
324	      unsigned char buf[MB_LEN_MAX];
325	      unsigned char *p;
326	      wchar_t wc;
327	      mbstate_t state;
328
329	      p = buf;
330	      *p++ = dfa->nodes[node].opr.c;
331	      while (++node < dfa->nodes_len
332		     &&	dfa->nodes[node].type == CHARACTER
333		     && dfa->nodes[node].mb_partial)
334		*p++ = dfa->nodes[node].opr.c;
335	      memset (&state, '\0', sizeof (state));
336	      if (mbrtowc (&wc, (const char *) buf, p - buf,
337			   &state) == p - buf
338		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
339		      != (size_t) -1))
340		re_set_fastmap (fastmap, false, buf[0]);
341	    }
342#endif
343	}
344      else if (type == SIMPLE_BRACKET)
345	{
346	  int i, ch;
347	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348	    {
349	      int j;
350	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352		if (w & ((bitset_word_t) 1 << j))
353		  re_set_fastmap (fastmap, icase, ch);
354	    }
355	}
356#ifdef RE_ENABLE_I18N
357      else if (type == COMPLEX_BRACKET)
358	{
359	  Idx i;
360	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
362	      || cset->nranges || cset->nchar_classes)
363	    {
364# ifdef _LIBC
365	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
366		{
367		  /* In this case we want to catch the bytes which are
368		     the first byte of any collation elements.
369		     e.g. In da_DK, we want to catch 'a' since "aa"
370			  is a valid collation element, and don't catch
371			  'b' since 'b' is the only collation element
372			  which starts from 'b'.  */
373		  const int32_t *table = (const int32_t *)
374		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375		  for (i = 0; i < SBC_MAX; ++i)
376		    if (table[i] < 0)
377		      re_set_fastmap (fastmap, icase, i);
378		}
379# else
380	      if (dfa->mb_cur_max > 1)
381		for (i = 0; i < SBC_MAX; ++i)
382		  if (__btowc (i) == WEOF)
383		    re_set_fastmap (fastmap, icase, i);
384# endif /* not _LIBC */
385	    }
386	  for (i = 0; i < cset->nmbchars; ++i)
387	    {
388	      char buf[256];
389	      mbstate_t state;
390	      memset (&state, '\0', sizeof (state));
391	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
392		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
393	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
394		{
395		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
396		      != (size_t) -1)
397		    re_set_fastmap (fastmap, false, *(unsigned char *) buf);
398		}
399	    }
400	}
401#endif /* RE_ENABLE_I18N */
402      else if (type == OP_PERIOD
403#ifdef RE_ENABLE_I18N
404	       || type == OP_UTF8_PERIOD
405#endif /* RE_ENABLE_I18N */
406	       || type == END_OF_RE)
407	{
408	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
409	  if (type == END_OF_RE)
410	    bufp->can_be_null = 1;
411	  return;
412	}
413    }
414}
415
416/* Entry point for POSIX code.  */
417/* regcomp takes a regular expression as a string and compiles it.
418
419   PREG is a regex_t *.  We do not expect any fields to be initialized,
420   since POSIX says we shouldn't.  Thus, we set
421
422     `buffer' to the compiled pattern;
423     `used' to the length of the compiled pattern;
424     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
425       REG_EXTENDED bit in CFLAGS is set; otherwise, to
426       RE_SYNTAX_POSIX_BASIC;
427     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
428     `fastmap' to an allocated space for the fastmap;
429     `fastmap_accurate' to zero;
430     `re_nsub' to the number of subexpressions in PATTERN.
431
432   PATTERN is the address of the pattern string.
433
434   CFLAGS is a series of bits which affect compilation.
435
436     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
437     use POSIX basic syntax.
438
439     If REG_NEWLINE is set, then . and [^...] don't match newline.
440     Also, regexec will try a match beginning after every newline.
441
442     If REG_ICASE is set, then we considers upper- and lowercase
443     versions of letters to be equivalent when matching.
444
445     If REG_NOSUB is set, then when PREG is passed to regexec, that
446     routine will report only success or failure, and nothing about the
447     registers.
448
449   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
450   the return codes and their meanings.)  */
451
452int
453regcomp (preg, pattern, cflags)
454    regex_t *_Restrict_ preg;
455    const char *_Restrict_ pattern;
456    int cflags;
457{
458  reg_errcode_t ret;
459  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
460			 : RE_SYNTAX_POSIX_BASIC);
461
462  preg->buffer = NULL;
463  preg->allocated = 0;
464  preg->used = 0;
465
466  /* Try to allocate space for the fastmap.  */
467  preg->fastmap = re_malloc (char, SBC_MAX);
468  if (BE (preg->fastmap == NULL, 0))
469    return REG_ESPACE;
470
471  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
472
473  /* If REG_NEWLINE is set, newlines are treated differently.  */
474  if (cflags & REG_NEWLINE)
475    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
476      syntax &= ~RE_DOT_NEWLINE;
477      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
478      /* It also changes the matching behavior.  */
479      preg->newline_anchor = 1;
480    }
481  else
482    preg->newline_anchor = 0;
483  preg->no_sub = !!(cflags & REG_NOSUB);
484  preg->translate = NULL;
485
486  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
487
488  /* POSIX doesn't distinguish between an unmatched open-group and an
489     unmatched close-group: both are REG_EPAREN.  */
490  if (ret == REG_ERPAREN)
491    ret = REG_EPAREN;
492
493  /* We have already checked preg->fastmap != NULL.  */
494  if (BE (ret == REG_NOERROR, 1))
495    /* Compute the fastmap now, since regexec cannot modify the pattern
496       buffer.  This function never fails in this implementation.  */
497    (void) re_compile_fastmap (preg);
498  else
499    {
500      /* Some error occurred while compiling the expression.  */
501      re_free (preg->fastmap);
502      preg->fastmap = NULL;
503    }
504
505  return (int) ret;
506}
507#ifdef _LIBC
508weak_alias (__regcomp, regcomp)
509#endif
510
511/* Returns a message corresponding to an error code, ERRCODE, returned
512   from either regcomp or regexec.   We don't use PREG here.  */
513
514#ifdef _LIBC
515size_t
516regerror (errcode, preg, errbuf, errbuf_size)
517    int errcode;
518    const regex_t *_Restrict_ preg;
519    char *_Restrict_ errbuf;
520    size_t errbuf_size;
521#else /* size_t might promote */
522size_t
523regerror (int errcode, const regex_t *_Restrict_ preg,
524	  char *_Restrict_ errbuf, size_t errbuf_size)
525#endif
526{
527  const char *msg;
528  size_t msg_size;
529
530  if (BE (errcode < 0
531	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
532			       / sizeof (__re_error_msgid_idx[0])), 0))
533    /* Only error codes returned by the rest of the code should be passed
534       to this routine.  If we are given anything else, or if other regex
535       code generates an invalid error code, then the program has a bug.
536       Dump core so we can fix it.  */
537    abort ();
538
539  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
540
541  msg_size = strlen (msg) + 1; /* Includes the null.  */
542
543  if (BE (errbuf_size != 0, 1))
544    {
545      size_t cpy_size = msg_size;
546      if (BE (msg_size > errbuf_size, 0))
547	{
548	  cpy_size = errbuf_size - 1;
549	  errbuf[cpy_size] = '\0';
550	}
551      memcpy (errbuf, msg, cpy_size);
552    }
553
554  return msg_size;
555}
556#ifdef _LIBC
557weak_alias (__regerror, regerror)
558#endif
559
560
561#ifdef RE_ENABLE_I18N
562/* This static array is used for the map to single-byte characters when
563   UTF-8 is used.  Otherwise we would allocate memory just to initialize
564   it the same all the time.  UTF-8 is the preferred encoding so this is
565   a worthwhile optimization.  */
566static const bitset_t utf8_sb_map =
567{
568  /* Set the first 128 bits.  */
569# if 4 * BITSET_WORD_BITS < ASCII_CHARS
570#  error "bitset_word_t is narrower than 32 bits"
571# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
572  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
573# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
574  BITSET_WORD_MAX, BITSET_WORD_MAX,
575# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
576  BITSET_WORD_MAX,
577# endif
578  (BITSET_WORD_MAX
579   >> (SBC_MAX % BITSET_WORD_BITS == 0
580       ? 0
581       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
582};
583#endif
584
585
586static void
587free_dfa_content (re_dfa_t *dfa)
588{
589  Idx i, j;
590
591  if (dfa->nodes)
592    for (i = 0; i < dfa->nodes_len; ++i)
593      free_token (dfa->nodes + i);
594  re_free (dfa->nexts);
595  for (i = 0; i < dfa->nodes_len; ++i)
596    {
597      if (dfa->eclosures != NULL)
598	re_node_set_free (dfa->eclosures + i);
599      if (dfa->inveclosures != NULL)
600	re_node_set_free (dfa->inveclosures + i);
601      if (dfa->edests != NULL)
602	re_node_set_free (dfa->edests + i);
603    }
604  re_free (dfa->edests);
605  re_free (dfa->eclosures);
606  re_free (dfa->inveclosures);
607  re_free (dfa->nodes);
608
609  if (dfa->state_table)
610    for (i = 0; i <= dfa->state_hash_mask; ++i)
611      {
612	struct re_state_table_entry *entry = dfa->state_table + i;
613	for (j = 0; j < entry->num; ++j)
614	  {
615	    re_dfastate_t *state = entry->array[j];
616	    free_state (state);
617	  }
618        re_free (entry->array);
619      }
620  re_free (dfa->state_table);
621#ifdef RE_ENABLE_I18N
622  if (dfa->sb_char != utf8_sb_map)
623    re_free (dfa->sb_char);
624#endif
625  re_free (dfa->subexp_map);
626#ifdef DEBUG
627  re_free (dfa->re_str);
628#endif
629
630  re_free (dfa);
631}
632
633
634/* Free dynamically allocated space used by PREG.  */
635
636void
637regfree (preg)
638    regex_t *preg;
639{
640  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
641  if (BE (dfa != NULL, 1))
642    free_dfa_content (dfa);
643  preg->buffer = NULL;
644  preg->allocated = 0;
645
646  re_free (preg->fastmap);
647  preg->fastmap = NULL;
648
649  re_free (preg->translate);
650  preg->translate = NULL;
651}
652#ifdef _LIBC
653weak_alias (__regfree, regfree)
654#endif
655
656/* Entry points compatible with 4.2 BSD regex library.  We don't define
657   them unless specifically requested.  */
658
659#if defined _REGEX_RE_COMP || defined _LIBC
660
661/* BSD has one and only one pattern buffer.  */
662static struct re_pattern_buffer re_comp_buf;
663
664char *
665# ifdef _LIBC
666/* Make these definitions weak in libc, so POSIX programs can redefine
667   these names if they don't use our functions, and still use
668   regcomp/regexec above without link errors.  */
669weak_function
670# endif
671re_comp (s)
672     const char *s;
673{
674  reg_errcode_t ret;
675  char *fastmap;
676
677  if (!s)
678    {
679      if (!re_comp_buf.buffer)
680	return gettext ("No previous regular expression");
681      return 0;
682    }
683
684  if (re_comp_buf.buffer)
685    {
686      fastmap = re_comp_buf.fastmap;
687      re_comp_buf.fastmap = NULL;
688      __regfree (&re_comp_buf);
689      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690      re_comp_buf.fastmap = fastmap;
691    }
692
693  if (re_comp_buf.fastmap == NULL)
694    {
695      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
696      if (re_comp_buf.fastmap == NULL)
697	return (char *) gettext (__re_error_msgid
698				 + __re_error_msgid_idx[(int) REG_ESPACE]);
699    }
700
701  /* Since `re_exec' always passes NULL for the `regs' argument, we
702     don't need to initialize the pattern buffer fields which affect it.  */
703
704  /* Match anchors at newlines.  */
705  re_comp_buf.newline_anchor = 1;
706
707  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708
709  if (!ret)
710    return NULL;
711
712  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
713  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
714}
715
716#ifdef _LIBC
717libc_freeres_fn (free_mem)
718{
719  __regfree (&re_comp_buf);
720}
721#endif
722
723#endif /* _REGEX_RE_COMP */
724
725/* Internal entry point.
726   Compile the regular expression PATTERN, whose length is LENGTH.
727   SYNTAX indicate regular expression's syntax.  */
728
729static reg_errcode_t
730re_compile_internal (regex_t *preg, const char * pattern, size_t length,
731		     reg_syntax_t syntax)
732{
733  reg_errcode_t err = REG_NOERROR;
734  re_dfa_t *dfa;
735  re_string_t regexp;
736
737  /* Initialize the pattern buffer.  */
738  preg->fastmap_accurate = 0;
739  preg->syntax = syntax;
740  preg->not_bol = preg->not_eol = 0;
741  preg->used = 0;
742  preg->re_nsub = 0;
743  preg->can_be_null = 0;
744  preg->regs_allocated = REGS_UNALLOCATED;
745
746  /* Initialize the dfa.  */
747  dfa = (re_dfa_t *) preg->buffer;
748  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
749    {
750      /* If zero allocated, but buffer is non-null, try to realloc
751	 enough space.  This loses if buffer's address is bogus, but
752	 that is the user's responsibility.  If ->buffer is NULL this
753	 is a simple allocation.  */
754      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
755      if (dfa == NULL)
756	return REG_ESPACE;
757      preg->allocated = sizeof (re_dfa_t);
758      preg->buffer = (unsigned char *) dfa;
759    }
760  preg->used = sizeof (re_dfa_t);
761
762  err = init_dfa (dfa, length);
763  if (BE (err != REG_NOERROR, 0))
764    {
765      free_dfa_content (dfa);
766      preg->buffer = NULL;
767      preg->allocated = 0;
768      return err;
769    }
770#ifdef DEBUG
771  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
772  dfa->re_str = re_malloc (char, length + 1);
773  strncpy (dfa->re_str, pattern, length + 1);
774#endif
775
776  __libc_lock_init (dfa->lock);
777
778  err = re_string_construct (&regexp, pattern, length, preg->translate,
779			     syntax & RE_ICASE, dfa);
780  if (BE (err != REG_NOERROR, 0))
781    {
782    re_compile_internal_free_return:
783      free_workarea_compile (preg);
784      re_string_destruct (&regexp);
785      free_dfa_content (dfa);
786      preg->buffer = NULL;
787      preg->allocated = 0;
788      return err;
789    }
790
791  /* Parse the regular expression, and build a structure tree.  */
792  preg->re_nsub = 0;
793  dfa->str_tree = parse (&regexp, preg, syntax, &err);
794  if (BE (dfa->str_tree == NULL, 0))
795    goto re_compile_internal_free_return;
796
797  /* Analyze the tree and create the nfa.  */
798  err = analyze (preg);
799  if (BE (err != REG_NOERROR, 0))
800    goto re_compile_internal_free_return;
801
802#ifdef RE_ENABLE_I18N
803  /* If possible, do searching in single byte encoding to speed things up.  */
804  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
805    optimize_utf8 (dfa);
806#endif
807
808  /* Then create the initial state of the dfa.  */
809  err = create_initial_state (dfa);
810
811  /* Release work areas.  */
812  free_workarea_compile (preg);
813  re_string_destruct (&regexp);
814
815  if (BE (err != REG_NOERROR, 0))
816    {
817      free_dfa_content (dfa);
818      preg->buffer = NULL;
819      preg->allocated = 0;
820    }
821
822  return err;
823}
824
825/* Initialize DFA.  We use the length of the regular expression PAT_LEN
826   as the initial length of some arrays.  */
827
828static reg_errcode_t
829init_dfa (re_dfa_t *dfa, size_t pat_len)
830{
831  __re_size_t table_size;
832#ifdef RE_ENABLE_I18N
833  size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
834#else
835  size_t max_i18n_object_size = 0;
836#endif
837  size_t max_object_size =
838    MAX (sizeof (struct re_state_table_entry),
839	 MAX (sizeof (re_token_t),
840	      MAX (sizeof (re_node_set),
841		   MAX (sizeof (regmatch_t),
842			max_i18n_object_size))));
843
844  memset (dfa, '\0', sizeof (re_dfa_t));
845
846  /* Force allocation of str_tree_storage the first time.  */
847  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
848
849  /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
850     calculation below, and for similar doubling calculations
851     elsewhere.  And it's <= rather than <, because some of the
852     doubling calculations add 1 afterwards.  */
853  if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
854    return REG_ESPACE;
855
856  dfa->nodes_alloc = pat_len + 1;
857  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
858
859  /*  table_size = 2 ^ ceil(log pat_len) */
860  for (table_size = 1; ; table_size <<= 1)
861    if (table_size > pat_len)
862      break;
863
864  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
865  dfa->state_hash_mask = table_size - 1;
866
867  dfa->mb_cur_max = MB_CUR_MAX;
868#ifdef _LIBC
869  if (dfa->mb_cur_max == 6
870      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
871    dfa->is_utf8 = 1;
872  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
873		       != 0);
874#else
875  if (strcmp (locale_charset (), "UTF-8") == 0)
876    dfa->is_utf8 = 1;
877
878  /* We check exhaustively in the loop below if this charset is a
879     superset of ASCII.  */
880  dfa->map_notascii = 0;
881#endif
882
883#ifdef RE_ENABLE_I18N
884  if (dfa->mb_cur_max > 1)
885    {
886      if (dfa->is_utf8)
887	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
888      else
889	{
890	  int i, j, ch;
891
892	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
893	  if (BE (dfa->sb_char == NULL, 0))
894	    return REG_ESPACE;
895
896	  /* Set the bits corresponding to single byte chars.  */
897	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
898	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
899	      {
900		wint_t wch = __btowc (ch);
901		if (wch != WEOF)
902		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
903# ifndef _LIBC
904		if (isascii (ch) && wch != ch)
905		  dfa->map_notascii = 1;
906# endif
907	      }
908	}
909    }
910#endif
911
912  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
913    return REG_ESPACE;
914  return REG_NOERROR;
915}
916
917/* Initialize WORD_CHAR table, which indicate which character is
918   "word".  In this case "word" means that it is the word construction
919   character used by some operators like "\<", "\>", etc.  */
920
921static void
922internal_function
923init_word_char (re_dfa_t *dfa)
924{
925  int i, j, ch;
926  dfa->word_ops_used = 1;
927  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
928    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
929      if (isalnum (ch) || ch == '_')
930	dfa->word_char[i] |= (bitset_word_t) 1 << j;
931}
932
933/* Free the work area which are only used while compiling.  */
934
935static void
936free_workarea_compile (regex_t *preg)
937{
938  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
939  bin_tree_storage_t *storage, *next;
940  for (storage = dfa->str_tree_storage; storage; storage = next)
941    {
942      next = storage->next;
943      re_free (storage);
944    }
945  dfa->str_tree_storage = NULL;
946  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
947  dfa->str_tree = NULL;
948  re_free (dfa->org_indices);
949  dfa->org_indices = NULL;
950}
951
952/* Create initial states for all contexts.  */
953
954static reg_errcode_t
955create_initial_state (re_dfa_t *dfa)
956{
957  Idx first, i;
958  reg_errcode_t err;
959  re_node_set init_nodes;
960
961  /* Initial states have the epsilon closure of the node which is
962     the first node of the regular expression.  */
963  first = dfa->str_tree->first->node_idx;
964  dfa->init_node = first;
965  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
966  if (BE (err != REG_NOERROR, 0))
967    return err;
968
969  /* The back-references which are in initial states can epsilon transit,
970     since in this case all of the subexpressions can be null.
971     Then we add epsilon closures of the nodes which are the next nodes of
972     the back-references.  */
973  if (dfa->nbackref > 0)
974    for (i = 0; i < init_nodes.nelem; ++i)
975      {
976	Idx node_idx = init_nodes.elems[i];
977	re_token_type_t type = dfa->nodes[node_idx].type;
978
979	Idx clexp_idx;
980	if (type != OP_BACK_REF)
981	  continue;
982	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
983	  {
984	    re_token_t *clexp_node;
985	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
986	    if (clexp_node->type == OP_CLOSE_SUBEXP
987		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
988	      break;
989	  }
990	if (clexp_idx == init_nodes.nelem)
991	  continue;
992
993	if (type == OP_BACK_REF)
994	  {
995	    Idx dest_idx = dfa->edests[node_idx].elems[0];
996	    if (!re_node_set_contains (&init_nodes, dest_idx))
997	      {
998		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
999		i = 0;
1000	      }
1001	  }
1002      }
1003
1004  /* It must be the first time to invoke acquire_state.  */
1005  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1006  /* We don't check ERR here, since the initial state must not be NULL.  */
1007  if (BE (dfa->init_state == NULL, 0))
1008    return err;
1009  if (dfa->init_state->has_constraint)
1010    {
1011      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1012						       CONTEXT_WORD);
1013      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1014						     CONTEXT_NEWLINE);
1015      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1016							 &init_nodes,
1017							 CONTEXT_NEWLINE
1018							 | CONTEXT_BEGBUF);
1019      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1020	      || dfa->init_state_begbuf == NULL, 0))
1021	return err;
1022    }
1023  else
1024    dfa->init_state_word = dfa->init_state_nl
1025      = dfa->init_state_begbuf = dfa->init_state;
1026
1027  re_node_set_free (&init_nodes);
1028  return REG_NOERROR;
1029}
1030
1031#ifdef RE_ENABLE_I18N
1032/* If it is possible to do searching in single byte encoding instead of UTF-8
1033   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1034   DFA nodes where needed.  */
1035
1036static void
1037optimize_utf8 (re_dfa_t *dfa)
1038{
1039  Idx node;
1040  int i;
1041  bool mb_chars = false;
1042  bool has_period = false;
1043
1044  for (node = 0; node < dfa->nodes_len; ++node)
1045    switch (dfa->nodes[node].type)
1046      {
1047      case CHARACTER:
1048	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1049	  mb_chars = true;
1050	break;
1051      case ANCHOR:
1052	switch (dfa->nodes[node].opr.ctx_type)
1053	  {
1054	  case LINE_FIRST:
1055	  case LINE_LAST:
1056	  case BUF_FIRST:
1057	  case BUF_LAST:
1058	    break;
1059	  default:
1060	    /* Word anchors etc. cannot be handled.  */
1061	    return;
1062	  }
1063	break;
1064      case OP_PERIOD:
1065        has_period = true;
1066        break;
1067      case OP_BACK_REF:
1068      case OP_ALT:
1069      case END_OF_RE:
1070      case OP_DUP_ASTERISK:
1071      case OP_OPEN_SUBEXP:
1072      case OP_CLOSE_SUBEXP:
1073	break;
1074      case COMPLEX_BRACKET:
1075	return;
1076      case SIMPLE_BRACKET:
1077	/* Just double check.  */
1078	{
1079	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1080			? 0
1081			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1082	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1083	    {
1084	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1085		return;
1086	      rshift = 0;
1087	    }
1088	}
1089	break;
1090      default:
1091	abort ();
1092      }
1093
1094  if (mb_chars || has_period)
1095    for (node = 0; node < dfa->nodes_len; ++node)
1096      {
1097	if (dfa->nodes[node].type == CHARACTER
1098	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1099	  dfa->nodes[node].mb_partial = 0;
1100	else if (dfa->nodes[node].type == OP_PERIOD)
1101	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1102      }
1103
1104  /* The search can be in single byte locale.  */
1105  dfa->mb_cur_max = 1;
1106  dfa->is_utf8 = 0;
1107  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1108}
1109#endif
1110
1111/* Analyze the structure tree, and calculate "first", "next", "edest",
1112   "eclosure", and "inveclosure".  */
1113
1114static reg_errcode_t
1115analyze (regex_t *preg)
1116{
1117  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1118  reg_errcode_t ret;
1119
1120  /* Allocate arrays.  */
1121  dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1122  dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1123  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1124  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1125  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1126	  || dfa->eclosures == NULL, 0))
1127    return REG_ESPACE;
1128
1129  dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1130  if (dfa->subexp_map != NULL)
1131    {
1132      Idx i;
1133      for (i = 0; i < preg->re_nsub; i++)
1134	dfa->subexp_map[i] = i;
1135      preorder (dfa->str_tree, optimize_subexps, dfa);
1136      for (i = 0; i < preg->re_nsub; i++)
1137	if (dfa->subexp_map[i] != i)
1138	  break;
1139      if (i == preg->re_nsub)
1140	{
1141	  free (dfa->subexp_map);
1142	  dfa->subexp_map = NULL;
1143	}
1144    }
1145
1146  ret = postorder (dfa->str_tree, lower_subexps, preg);
1147  if (BE (ret != REG_NOERROR, 0))
1148    return ret;
1149  ret = postorder (dfa->str_tree, calc_first, dfa);
1150  if (BE (ret != REG_NOERROR, 0))
1151    return ret;
1152  preorder (dfa->str_tree, calc_next, dfa);
1153  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1154  if (BE (ret != REG_NOERROR, 0))
1155    return ret;
1156  ret = calc_eclosure (dfa);
1157  if (BE (ret != REG_NOERROR, 0))
1158    return ret;
1159
1160  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1161     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1162  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1163      || dfa->nbackref)
1164    {
1165      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1166      if (BE (dfa->inveclosures == NULL, 0))
1167        return REG_ESPACE;
1168      ret = calc_inveclosure (dfa);
1169    }
1170
1171  return ret;
1172}
1173
1174/* Our parse trees are very unbalanced, so we cannot use a stack to
1175   implement parse tree visits.  Instead, we use parent pointers and
1176   some hairy code in these two functions.  */
1177static reg_errcode_t
1178postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1179	   void *extra)
1180{
1181  bin_tree_t *node, *prev;
1182
1183  for (node = root; ; )
1184    {
1185      /* Descend down the tree, preferably to the left (or to the right
1186	 if that's the only child).  */
1187      while (node->left || node->right)
1188	if (node->left)
1189          node = node->left;
1190        else
1191          node = node->right;
1192
1193      do
1194	{
1195	  reg_errcode_t err = fn (extra, node);
1196	  if (BE (err != REG_NOERROR, 0))
1197	    return err;
1198          if (node->parent == NULL)
1199	    return REG_NOERROR;
1200	  prev = node;
1201	  node = node->parent;
1202	}
1203      /* Go up while we have a node that is reached from the right.  */
1204      while (node->right == prev || node->right == NULL);
1205      node = node->right;
1206    }
1207}
1208
1209static reg_errcode_t
1210preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211	  void *extra)
1212{
1213  bin_tree_t *node;
1214
1215  for (node = root; ; )
1216    {
1217      reg_errcode_t err = fn (extra, node);
1218      if (BE (err != REG_NOERROR, 0))
1219	return err;
1220
1221      /* Go to the left node, or up and to the right.  */
1222      if (node->left)
1223	node = node->left;
1224      else
1225	{
1226	  bin_tree_t *prev = NULL;
1227	  while (node->right == prev || node->right == NULL)
1228	    {
1229	      prev = node;
1230	      node = node->parent;
1231	      if (!node)
1232	        return REG_NOERROR;
1233	    }
1234	  node = node->right;
1235	}
1236    }
1237}
1238
1239/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1240   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1241   backreferences as well.  Requires a preorder visit.  */
1242static reg_errcode_t
1243optimize_subexps (void *extra, bin_tree_t *node)
1244{
1245  re_dfa_t *dfa = (re_dfa_t *) extra;
1246
1247  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1248    {
1249      int idx = node->token.opr.idx;
1250      node->token.opr.idx = dfa->subexp_map[idx];
1251      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1252    }
1253
1254  else if (node->token.type == SUBEXP
1255           && node->left && node->left->token.type == SUBEXP)
1256    {
1257      Idx other_idx = node->left->token.opr.idx;
1258
1259      node->left = node->left->left;
1260      if (node->left)
1261        node->left->parent = node;
1262
1263      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1264      if (other_idx < BITSET_WORD_BITS)
1265	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1266    }
1267
1268  return REG_NOERROR;
1269}
1270
1271/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1272   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1273static reg_errcode_t
1274lower_subexps (void *extra, bin_tree_t *node)
1275{
1276  regex_t *preg = (regex_t *) extra;
1277  reg_errcode_t err = REG_NOERROR;
1278
1279  if (node->left && node->left->token.type == SUBEXP)
1280    {
1281      node->left = lower_subexp (&err, preg, node->left);
1282      if (node->left)
1283	node->left->parent = node;
1284    }
1285  if (node->right && node->right->token.type == SUBEXP)
1286    {
1287      node->right = lower_subexp (&err, preg, node->right);
1288      if (node->right)
1289	node->right->parent = node;
1290    }
1291
1292  return err;
1293}
1294
1295static bin_tree_t *
1296lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1297{
1298  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1299  bin_tree_t *body = node->left;
1300  bin_tree_t *op, *cls, *tree1, *tree;
1301
1302  if (preg->no_sub
1303      /* We do not optimize empty subexpressions, because otherwise we may
1304	 have bad CONCAT nodes with NULL children.  This is obviously not
1305	 very common, so we do not lose much.  An example that triggers
1306	 this case is the sed "script" /\(\)/x.  */
1307      && node->left != NULL
1308      && (node->token.opr.idx >= BITSET_WORD_BITS
1309	  || !(dfa->used_bkref_map
1310	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1311    return node->left;
1312
1313  /* Convert the SUBEXP node to the concatenation of an
1314     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1315  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1316  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1317  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1318  tree = create_tree (dfa, op, tree1, CONCAT);
1319  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1320    {
1321      *err = REG_ESPACE;
1322      return NULL;
1323    }
1324
1325  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1326  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1327  return tree;
1328}
1329
1330/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1331   nodes.  Requires a postorder visit.  */
1332static reg_errcode_t
1333calc_first (void *extra, bin_tree_t *node)
1334{
1335  re_dfa_t *dfa = (re_dfa_t *) extra;
1336  if (node->token.type == CONCAT)
1337    {
1338      node->first = node->left->first;
1339      node->node_idx = node->left->node_idx;
1340    }
1341  else
1342    {
1343      node->first = node;
1344      node->node_idx = re_dfa_add_node (dfa, node->token);
1345      if (BE (node->node_idx == REG_MISSING, 0))
1346        return REG_ESPACE;
1347    }
1348  return REG_NOERROR;
1349}
1350
1351/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1352static reg_errcode_t
1353calc_next (void *extra, bin_tree_t *node)
1354{
1355  switch (node->token.type)
1356    {
1357    case OP_DUP_ASTERISK:
1358      node->left->next = node;
1359      break;
1360    case CONCAT:
1361      node->left->next = node->right->first;
1362      node->right->next = node->next;
1363      break;
1364    default:
1365      if (node->left)
1366	node->left->next = node->next;
1367      if (node->right)
1368        node->right->next = node->next;
1369      break;
1370    }
1371  return REG_NOERROR;
1372}
1373
1374/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1375static reg_errcode_t
1376link_nfa_nodes (void *extra, bin_tree_t *node)
1377{
1378  re_dfa_t *dfa = (re_dfa_t *) extra;
1379  Idx idx = node->node_idx;
1380  reg_errcode_t err = REG_NOERROR;
1381
1382  switch (node->token.type)
1383    {
1384    case CONCAT:
1385      break;
1386
1387    case END_OF_RE:
1388      assert (node->next == NULL);
1389      break;
1390
1391    case OP_DUP_ASTERISK:
1392    case OP_ALT:
1393      {
1394	Idx left, right;
1395	dfa->has_plural_match = 1;
1396	if (node->left != NULL)
1397	  left = node->left->first->node_idx;
1398	else
1399	  left = node->next->node_idx;
1400	if (node->right != NULL)
1401	  right = node->right->first->node_idx;
1402	else
1403	  right = node->next->node_idx;
1404	assert (REG_VALID_INDEX (left));
1405	assert (REG_VALID_INDEX (right));
1406	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1407      }
1408      break;
1409
1410    case ANCHOR:
1411    case OP_OPEN_SUBEXP:
1412    case OP_CLOSE_SUBEXP:
1413      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1414      break;
1415
1416    case OP_BACK_REF:
1417      dfa->nexts[idx] = node->next->node_idx;
1418      if (node->token.type == OP_BACK_REF)
1419	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1420      break;
1421
1422    default:
1423      assert (!IS_EPSILON_NODE (node->token.type));
1424      dfa->nexts[idx] = node->next->node_idx;
1425      break;
1426    }
1427
1428  return err;
1429}
1430
1431/* Duplicate the epsilon closure of the node ROOT_NODE.
1432   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1433   to their own constraint.  */
1434
1435static reg_errcode_t
1436internal_function
1437duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1438			Idx root_node, unsigned int init_constraint)
1439{
1440  Idx org_node, clone_node;
1441  bool ok;
1442  unsigned int constraint = init_constraint;
1443  for (org_node = top_org_node, clone_node = top_clone_node;;)
1444    {
1445      Idx org_dest, clone_dest;
1446      if (dfa->nodes[org_node].type == OP_BACK_REF)
1447	{
1448	  /* If the back reference epsilon-transit, its destination must
1449	     also have the constraint.  Then duplicate the epsilon closure
1450	     of the destination of the back reference, and store it in
1451	     edests of the back reference.  */
1452	  org_dest = dfa->nexts[org_node];
1453	  re_node_set_empty (dfa->edests + clone_node);
1454	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1455	  if (BE (clone_dest == REG_MISSING, 0))
1456	    return REG_ESPACE;
1457	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1458	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1459	  if (BE (! ok, 0))
1460	    return REG_ESPACE;
1461	}
1462      else if (dfa->edests[org_node].nelem == 0)
1463	{
1464	  /* In case of the node can't epsilon-transit, don't duplicate the
1465	     destination and store the original destination as the
1466	     destination of the node.  */
1467	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1468	  break;
1469	}
1470      else if (dfa->edests[org_node].nelem == 1)
1471	{
1472	  /* In case of the node can epsilon-transit, and it has only one
1473	     destination.  */
1474	  org_dest = dfa->edests[org_node].elems[0];
1475	  re_node_set_empty (dfa->edests + clone_node);
1476	  if (dfa->nodes[org_node].type == ANCHOR)
1477	    {
1478	      /* In case of the node has another constraint, append it.  */
1479	      if (org_node == root_node && clone_node != org_node)
1480		{
1481		  /* ...but if the node is root_node itself, it means the
1482		     epsilon closure have a loop, then tie it to the
1483		     destination of the root_node.  */
1484		  ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1485		  if (BE (! ok, 0))
1486		    return REG_ESPACE;
1487		  break;
1488		}
1489	      constraint |= dfa->nodes[org_node].opr.ctx_type;
1490	    }
1491	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1492	  if (BE (clone_dest == REG_MISSING, 0))
1493	    return REG_ESPACE;
1494	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495	  if (BE (! ok, 0))
1496	    return REG_ESPACE;
1497	}
1498      else /* dfa->edests[org_node].nelem == 2 */
1499	{
1500	  /* In case of the node can epsilon-transit, and it has two
1501	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1502	  org_dest = dfa->edests[org_node].elems[0];
1503	  re_node_set_empty (dfa->edests + clone_node);
1504	  /* Search for a duplicated node which satisfies the constraint.  */
1505	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1506	  if (clone_dest == REG_MISSING)
1507	    {
1508	      /* There are no such a duplicated node, create a new one.  */
1509	      reg_errcode_t err;
1510	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1511	      if (BE (clone_dest == REG_MISSING, 0))
1512		return REG_ESPACE;
1513	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1514	      if (BE (! ok, 0))
1515		return REG_ESPACE;
1516	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1517					    root_node, constraint);
1518	      if (BE (err != REG_NOERROR, 0))
1519		return err;
1520	    }
1521	  else
1522	    {
1523	      /* There are a duplicated node which satisfy the constraint,
1524		 use it to avoid infinite loop.  */
1525	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526	      if (BE (! ok, 0))
1527		return REG_ESPACE;
1528	    }
1529
1530	  org_dest = dfa->edests[org_node].elems[1];
1531	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1532	  if (BE (clone_dest == REG_MISSING, 0))
1533	    return REG_ESPACE;
1534	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535	  if (BE (! ok, 0))
1536	    return REG_ESPACE;
1537	}
1538      org_node = org_dest;
1539      clone_node = clone_dest;
1540    }
1541  return REG_NOERROR;
1542}
1543
1544/* Search for a node which is duplicated from the node ORG_NODE, and
1545   satisfies the constraint CONSTRAINT.  */
1546
1547static Idx
1548search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1549			unsigned int constraint)
1550{
1551  Idx idx;
1552  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1553    {
1554      if (org_node == dfa->org_indices[idx]
1555	  && constraint == dfa->nodes[idx].constraint)
1556	return idx; /* Found.  */
1557    }
1558  return REG_MISSING; /* Not found.  */
1559}
1560
1561/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1562   Return the index of the new node, or REG_MISSING if insufficient storage is
1563   available.  */
1564
1565static Idx
1566duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1567{
1568  Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1569  if (BE (dup_idx != REG_MISSING, 1))
1570    {
1571      dfa->nodes[dup_idx].constraint = constraint;
1572      if (dfa->nodes[org_idx].type == ANCHOR)
1573	dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1574      dfa->nodes[dup_idx].duplicated = 1;
1575
1576      /* Store the index of the original node.  */
1577      dfa->org_indices[dup_idx] = org_idx;
1578    }
1579  return dup_idx;
1580}
1581
1582static reg_errcode_t
1583calc_inveclosure (re_dfa_t *dfa)
1584{
1585  Idx src, idx;
1586  bool ok;
1587  for (idx = 0; idx < dfa->nodes_len; ++idx)
1588    re_node_set_init_empty (dfa->inveclosures + idx);
1589
1590  for (src = 0; src < dfa->nodes_len; ++src)
1591    {
1592      Idx *elems = dfa->eclosures[src].elems;
1593      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1594	{
1595	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1596	  if (BE (! ok, 0))
1597	    return REG_ESPACE;
1598	}
1599    }
1600
1601  return REG_NOERROR;
1602}
1603
1604/* Calculate "eclosure" for all the node in DFA.  */
1605
1606static reg_errcode_t
1607calc_eclosure (re_dfa_t *dfa)
1608{
1609  Idx node_idx;
1610  bool incomplete;
1611#ifdef DEBUG
1612  assert (dfa->nodes_len > 0);
1613#endif
1614  incomplete = false;
1615  /* For each nodes, calculate epsilon closure.  */
1616  for (node_idx = 0; ; ++node_idx)
1617    {
1618      reg_errcode_t err;
1619      re_node_set eclosure_elem;
1620      if (node_idx == dfa->nodes_len)
1621	{
1622	  if (!incomplete)
1623	    break;
1624	  incomplete = false;
1625	  node_idx = 0;
1626	}
1627
1628#ifdef DEBUG
1629      assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1630#endif
1631
1632      /* If we have already calculated, skip it.  */
1633      if (dfa->eclosures[node_idx].nelem != 0)
1634	continue;
1635      /* Calculate epsilon closure of `node_idx'.  */
1636      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1637      if (BE (err != REG_NOERROR, 0))
1638	return err;
1639
1640      if (dfa->eclosures[node_idx].nelem == 0)
1641	{
1642	  incomplete = true;
1643	  re_node_set_free (&eclosure_elem);
1644	}
1645    }
1646  return REG_NOERROR;
1647}
1648
1649/* Calculate epsilon closure of NODE.  */
1650
1651static reg_errcode_t
1652calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1653{
1654  reg_errcode_t err;
1655  unsigned int constraint;
1656  Idx i;
1657  bool incomplete;
1658  bool ok;
1659  re_node_set eclosure;
1660  incomplete = false;
1661  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662  if (BE (err != REG_NOERROR, 0))
1663    return err;
1664
1665  /* This indicates that we are calculating this node now.
1666     We reference this value to avoid infinite loop.  */
1667  dfa->eclosures[node].nelem = REG_MISSING;
1668
1669  constraint = ((dfa->nodes[node].type == ANCHOR)
1670		? dfa->nodes[node].opr.ctx_type : 0);
1671  /* If the current node has constraints, duplicate all nodes.
1672     Since they must inherit the constraints.  */
1673  if (constraint
1674      && dfa->edests[node].nelem
1675      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1676    {
1677      err = duplicate_node_closure (dfa, node, node, node, constraint);
1678      if (BE (err != REG_NOERROR, 0))
1679	return err;
1680    }
1681
1682  /* Expand each epsilon destination nodes.  */
1683  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1684    for (i = 0; i < dfa->edests[node].nelem; ++i)
1685      {
1686	re_node_set eclosure_elem;
1687	Idx edest = dfa->edests[node].elems[i];
1688	/* If calculating the epsilon closure of `edest' is in progress,
1689	   return intermediate result.  */
1690	if (dfa->eclosures[edest].nelem == REG_MISSING)
1691	  {
1692	    incomplete = true;
1693	    continue;
1694	  }
1695	/* If we haven't calculated the epsilon closure of `edest' yet,
1696	   calculate now. Otherwise use calculated epsilon closure.  */
1697	if (dfa->eclosures[edest].nelem == 0)
1698	  {
1699	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1700	    if (BE (err != REG_NOERROR, 0))
1701	      return err;
1702	  }
1703	else
1704	  eclosure_elem = dfa->eclosures[edest];
1705	/* Merge the epsilon closure of `edest'.  */
1706	re_node_set_merge (&eclosure, &eclosure_elem);
1707	/* If the epsilon closure of `edest' is incomplete,
1708	   the epsilon closure of this node is also incomplete.  */
1709	if (dfa->eclosures[edest].nelem == 0)
1710	  {
1711	    incomplete = true;
1712	    re_node_set_free (&eclosure_elem);
1713	  }
1714      }
1715
1716  /* Epsilon closures include itself.  */
1717  ok = re_node_set_insert (&eclosure, node);
1718  if (BE (! ok, 0))
1719    return REG_ESPACE;
1720  if (incomplete && !root)
1721    dfa->eclosures[node].nelem = 0;
1722  else
1723    dfa->eclosures[node] = eclosure;
1724  *new_set = eclosure;
1725  return REG_NOERROR;
1726}
1727
1728/* Functions for token which are used in the parser.  */
1729
1730/* Fetch a token from INPUT.
1731   We must not use this function inside bracket expressions.  */
1732
1733static void
1734internal_function
1735fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1736{
1737  re_string_skip_bytes (input, peek_token (result, input, syntax));
1738}
1739
1740/* Peek a token from INPUT, and return the length of the token.
1741   We must not use this function inside bracket expressions.  */
1742
1743static int
1744internal_function
1745peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1746{
1747  unsigned char c;
1748
1749  if (re_string_eoi (input))
1750    {
1751      token->type = END_OF_RE;
1752      return 0;
1753    }
1754
1755  c = re_string_peek_byte (input, 0);
1756  token->opr.c = c;
1757
1758  token->word_char = 0;
1759#ifdef RE_ENABLE_I18N
1760  token->mb_partial = 0;
1761  if (input->mb_cur_max > 1 &&
1762      !re_string_first_byte (input, re_string_cur_idx (input)))
1763    {
1764      token->type = CHARACTER;
1765      token->mb_partial = 1;
1766      return 1;
1767    }
1768#endif
1769  if (c == '\\')
1770    {
1771      unsigned char c2;
1772      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1773	{
1774	  token->type = BACK_SLASH;
1775	  return 1;
1776	}
1777
1778      c2 = re_string_peek_byte_case (input, 1);
1779      token->opr.c = c2;
1780      token->type = CHARACTER;
1781#ifdef RE_ENABLE_I18N
1782      if (input->mb_cur_max > 1)
1783	{
1784	  wint_t wc = re_string_wchar_at (input,
1785					  re_string_cur_idx (input) + 1);
1786	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1787	}
1788      else
1789#endif
1790	token->word_char = IS_WORD_CHAR (c2) != 0;
1791
1792      switch (c2)
1793	{
1794	case '|':
1795	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1796	    token->type = OP_ALT;
1797	  break;
1798	case '1': case '2': case '3': case '4': case '5':
1799	case '6': case '7': case '8': case '9':
1800	  if (!(syntax & RE_NO_BK_REFS))
1801	    {
1802	      token->type = OP_BACK_REF;
1803	      token->opr.idx = c2 - '1';
1804	    }
1805	  break;
1806	case '<':
1807	  if (!(syntax & RE_NO_GNU_OPS))
1808	    {
1809	      token->type = ANCHOR;
1810	      token->opr.ctx_type = WORD_FIRST;
1811	    }
1812	  break;
1813	case '>':
1814	  if (!(syntax & RE_NO_GNU_OPS))
1815	    {
1816	      token->type = ANCHOR;
1817	      token->opr.ctx_type = WORD_LAST;
1818	    }
1819	  break;
1820	case 'b':
1821	  if (!(syntax & RE_NO_GNU_OPS))
1822	    {
1823	      token->type = ANCHOR;
1824	      token->opr.ctx_type = WORD_DELIM;
1825	    }
1826	  break;
1827	case 'B':
1828	  if (!(syntax & RE_NO_GNU_OPS))
1829	    {
1830	      token->type = ANCHOR;
1831	      token->opr.ctx_type = NOT_WORD_DELIM;
1832	    }
1833	  break;
1834	case 'w':
1835	  if (!(syntax & RE_NO_GNU_OPS))
1836	    token->type = OP_WORD;
1837	  break;
1838	case 'W':
1839	  if (!(syntax & RE_NO_GNU_OPS))
1840	    token->type = OP_NOTWORD;
1841	  break;
1842	case 's':
1843	  if (!(syntax & RE_NO_GNU_OPS))
1844	    token->type = OP_SPACE;
1845	  break;
1846	case 'S':
1847	  if (!(syntax & RE_NO_GNU_OPS))
1848	    token->type = OP_NOTSPACE;
1849	  break;
1850	case '`':
1851	  if (!(syntax & RE_NO_GNU_OPS))
1852	    {
1853	      token->type = ANCHOR;
1854	      token->opr.ctx_type = BUF_FIRST;
1855	    }
1856	  break;
1857	case '\'':
1858	  if (!(syntax & RE_NO_GNU_OPS))
1859	    {
1860	      token->type = ANCHOR;
1861	      token->opr.ctx_type = BUF_LAST;
1862	    }
1863	  break;
1864	case '(':
1865	  if (!(syntax & RE_NO_BK_PARENS))
1866	    token->type = OP_OPEN_SUBEXP;
1867	  break;
1868	case ')':
1869	  if (!(syntax & RE_NO_BK_PARENS))
1870	    token->type = OP_CLOSE_SUBEXP;
1871	  break;
1872	case '+':
1873	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1874	    token->type = OP_DUP_PLUS;
1875	  break;
1876	case '?':
1877	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1878	    token->type = OP_DUP_QUESTION;
1879	  break;
1880	case '{':
1881	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1882	    token->type = OP_OPEN_DUP_NUM;
1883	  break;
1884	case '}':
1885	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1886	    token->type = OP_CLOSE_DUP_NUM;
1887	  break;
1888	default:
1889	  break;
1890	}
1891      return 2;
1892    }
1893
1894  token->type = CHARACTER;
1895#ifdef RE_ENABLE_I18N
1896  if (input->mb_cur_max > 1)
1897    {
1898      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1899      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1900    }
1901  else
1902#endif
1903    token->word_char = IS_WORD_CHAR (token->opr.c);
1904
1905  switch (c)
1906    {
1907    case '\n':
1908      if (syntax & RE_NEWLINE_ALT)
1909	token->type = OP_ALT;
1910      break;
1911    case '|':
1912      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1913	token->type = OP_ALT;
1914      break;
1915    case '*':
1916      token->type = OP_DUP_ASTERISK;
1917      break;
1918    case '+':
1919      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1920	token->type = OP_DUP_PLUS;
1921      break;
1922    case '?':
1923      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1924	token->type = OP_DUP_QUESTION;
1925      break;
1926    case '{':
1927      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1928	token->type = OP_OPEN_DUP_NUM;
1929      break;
1930    case '}':
1931      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1932	token->type = OP_CLOSE_DUP_NUM;
1933      break;
1934    case '(':
1935      if (syntax & RE_NO_BK_PARENS)
1936	token->type = OP_OPEN_SUBEXP;
1937      break;
1938    case ')':
1939      if (syntax & RE_NO_BK_PARENS)
1940	token->type = OP_CLOSE_SUBEXP;
1941      break;
1942    case '[':
1943      token->type = OP_OPEN_BRACKET;
1944      break;
1945    case '.':
1946      token->type = OP_PERIOD;
1947      break;
1948    case '^':
1949      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1950	  re_string_cur_idx (input) != 0)
1951	{
1952	  char prev = re_string_peek_byte (input, -1);
1953	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1954	    break;
1955	}
1956      token->type = ANCHOR;
1957      token->opr.ctx_type = LINE_FIRST;
1958      break;
1959    case '$':
1960      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1961	  re_string_cur_idx (input) + 1 != re_string_length (input))
1962	{
1963	  re_token_t next;
1964	  re_string_skip_bytes (input, 1);
1965	  peek_token (&next, input, syntax);
1966	  re_string_skip_bytes (input, -1);
1967	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1968	    break;
1969	}
1970      token->type = ANCHOR;
1971      token->opr.ctx_type = LINE_LAST;
1972      break;
1973    default:
1974      break;
1975    }
1976  return 1;
1977}
1978
1979/* Peek a token from INPUT, and return the length of the token.
1980   We must not use this function out of bracket expressions.  */
1981
1982static int
1983internal_function
1984peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1985{
1986  unsigned char c;
1987  if (re_string_eoi (input))
1988    {
1989      token->type = END_OF_RE;
1990      return 0;
1991    }
1992  c = re_string_peek_byte (input, 0);
1993  token->opr.c = c;
1994
1995#ifdef RE_ENABLE_I18N
1996  if (input->mb_cur_max > 1 &&
1997      !re_string_first_byte (input, re_string_cur_idx (input)))
1998    {
1999      token->type = CHARACTER;
2000      return 1;
2001    }
2002#endif /* RE_ENABLE_I18N */
2003
2004  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2005      && re_string_cur_idx (input) + 1 < re_string_length (input))
2006    {
2007      /* In this case, '\' escape a character.  */
2008      unsigned char c2;
2009      re_string_skip_bytes (input, 1);
2010      c2 = re_string_peek_byte (input, 0);
2011      token->opr.c = c2;
2012      token->type = CHARACTER;
2013      return 1;
2014    }
2015  if (c == '[') /* '[' is a special char in a bracket exps.  */
2016    {
2017      unsigned char c2;
2018      int token_len;
2019      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2020	c2 = re_string_peek_byte (input, 1);
2021      else
2022	c2 = 0;
2023      token->opr.c = c2;
2024      token_len = 2;
2025      switch (c2)
2026	{
2027	case '.':
2028	  token->type = OP_OPEN_COLL_ELEM;
2029	  break;
2030	case '=':
2031	  token->type = OP_OPEN_EQUIV_CLASS;
2032	  break;
2033	case ':':
2034	  if (syntax & RE_CHAR_CLASSES)
2035	    {
2036	      token->type = OP_OPEN_CHAR_CLASS;
2037	      break;
2038	    }
2039	  /* else fall through.  */
2040	default:
2041	  token->type = CHARACTER;
2042	  token->opr.c = c;
2043	  token_len = 1;
2044	  break;
2045	}
2046      return token_len;
2047    }
2048  switch (c)
2049    {
2050    case '-':
2051      token->type = OP_CHARSET_RANGE;
2052      break;
2053    case ']':
2054      token->type = OP_CLOSE_BRACKET;
2055      break;
2056    case '^':
2057      token->type = OP_NON_MATCH_LIST;
2058      break;
2059    default:
2060      token->type = CHARACTER;
2061    }
2062  return 1;
2063}
2064
2065/* Functions for parser.  */
2066
2067/* Entry point of the parser.
2068   Parse the regular expression REGEXP and return the structure tree.
2069   If an error is occured, ERR is set by error code, and return NULL.
2070   This function build the following tree, from regular expression <reg_exp>:
2071	   CAT
2072	   / \
2073	  /   \
2074   <reg_exp>  EOR
2075
2076   CAT means concatenation.
2077   EOR means end of regular expression.  */
2078
2079static bin_tree_t *
2080parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2081       reg_errcode_t *err)
2082{
2083  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2084  bin_tree_t *tree, *eor, *root;
2085  re_token_t current_token;
2086  dfa->syntax = syntax;
2087  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2088  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2089  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2090    return NULL;
2091  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2092  if (tree != NULL)
2093    root = create_tree (dfa, tree, eor, CONCAT);
2094  else
2095    root = eor;
2096  if (BE (eor == NULL || root == NULL, 0))
2097    {
2098      *err = REG_ESPACE;
2099      return NULL;
2100    }
2101  return root;
2102}
2103
2104/* This function build the following tree, from regular expression
2105   <branch1>|<branch2>:
2106	   ALT
2107	   / \
2108	  /   \
2109   <branch1> <branch2>
2110
2111   ALT means alternative, which represents the operator `|'.  */
2112
2113static bin_tree_t *
2114parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2115	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2116{
2117  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2118  bin_tree_t *tree, *branch = NULL;
2119  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2120  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121    return NULL;
2122
2123  while (token->type == OP_ALT)
2124    {
2125      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126      if (token->type != OP_ALT && token->type != END_OF_RE
2127	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2128	{
2129	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2130	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2131	    return NULL;
2132	}
2133      else
2134	branch = NULL;
2135      tree = create_tree (dfa, tree, branch, OP_ALT);
2136      if (BE (tree == NULL, 0))
2137	{
2138	  *err = REG_ESPACE;
2139	  return NULL;
2140	}
2141    }
2142  return tree;
2143}
2144
2145/* This function build the following tree, from regular expression
2146   <exp1><exp2>:
2147	CAT
2148	/ \
2149       /   \
2150   <exp1> <exp2>
2151
2152   CAT means concatenation.  */
2153
2154static bin_tree_t *
2155parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2156	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2157{
2158  bin_tree_t *tree, *expr;
2159  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2160  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2161  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2162    return NULL;
2163
2164  while (token->type != OP_ALT && token->type != END_OF_RE
2165	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166    {
2167      expr = parse_expression (regexp, preg, token, syntax, nest, err);
2168      if (BE (*err != REG_NOERROR && expr == NULL, 0))
2169	{
2170	  return NULL;
2171	}
2172      if (tree != NULL && expr != NULL)
2173	{
2174	  tree = create_tree (dfa, tree, expr, CONCAT);
2175	  if (tree == NULL)
2176	    {
2177	      *err = REG_ESPACE;
2178	      return NULL;
2179	    }
2180	}
2181      else if (tree == NULL)
2182	tree = expr;
2183      /* Otherwise expr == NULL, we don't need to create new tree.  */
2184    }
2185  return tree;
2186}
2187
2188/* This function build the following tree, from regular expression a*:
2189	 *
2190	 |
2191	 a
2192*/
2193
2194static bin_tree_t *
2195parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2196		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2197{
2198  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2199  bin_tree_t *tree;
2200  switch (token->type)
2201    {
2202    case CHARACTER:
2203      tree = create_token_tree (dfa, NULL, NULL, token);
2204      if (BE (tree == NULL, 0))
2205	{
2206	  *err = REG_ESPACE;
2207	  return NULL;
2208	}
2209#ifdef RE_ENABLE_I18N
2210      if (dfa->mb_cur_max > 1)
2211	{
2212	  while (!re_string_eoi (regexp)
2213		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2214	    {
2215	      bin_tree_t *mbc_remain;
2216	      fetch_token (token, regexp, syntax);
2217	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2218	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2219	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2220		{
2221		  *err = REG_ESPACE;
2222		  return NULL;
2223		}
2224	    }
2225	}
2226#endif
2227      break;
2228    case OP_OPEN_SUBEXP:
2229      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2230      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2231	return NULL;
2232      break;
2233    case OP_OPEN_BRACKET:
2234      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2235      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2236	return NULL;
2237      break;
2238    case OP_BACK_REF:
2239      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2240	{
2241	  *err = REG_ESUBREG;
2242	  return NULL;
2243	}
2244      dfa->used_bkref_map |= 1 << token->opr.idx;
2245      tree = create_token_tree (dfa, NULL, NULL, token);
2246      if (BE (tree == NULL, 0))
2247	{
2248	  *err = REG_ESPACE;
2249	  return NULL;
2250	}
2251      ++dfa->nbackref;
2252      dfa->has_mb_node = 1;
2253      break;
2254    case OP_OPEN_DUP_NUM:
2255      if (syntax & RE_CONTEXT_INVALID_DUP)
2256	{
2257	  *err = REG_BADRPT;
2258	  return NULL;
2259	}
2260      /* FALLTHROUGH */
2261    case OP_DUP_ASTERISK:
2262    case OP_DUP_PLUS:
2263    case OP_DUP_QUESTION:
2264      if (syntax & RE_CONTEXT_INVALID_OPS)
2265	{
2266	  *err = REG_BADRPT;
2267	  return NULL;
2268	}
2269      else if (syntax & RE_CONTEXT_INDEP_OPS)
2270	{
2271	  fetch_token (token, regexp, syntax);
2272	  return parse_expression (regexp, preg, token, syntax, nest, err);
2273	}
2274      /* else fall through  */
2275    case OP_CLOSE_SUBEXP:
2276      if ((token->type == OP_CLOSE_SUBEXP) &&
2277	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2278	{
2279	  *err = REG_ERPAREN;
2280	  return NULL;
2281	}
2282      /* else fall through  */
2283    case OP_CLOSE_DUP_NUM:
2284      /* We treat it as a normal character.  */
2285
2286      /* Then we can these characters as normal characters.  */
2287      token->type = CHARACTER;
2288      /* mb_partial and word_char bits should be initialized already
2289	 by peek_token.  */
2290      tree = create_token_tree (dfa, NULL, NULL, token);
2291      if (BE (tree == NULL, 0))
2292	{
2293	  *err = REG_ESPACE;
2294	  return NULL;
2295	}
2296      break;
2297    case ANCHOR:
2298      if ((token->opr.ctx_type
2299	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2300	  && dfa->word_ops_used == 0)
2301	init_word_char (dfa);
2302      if (token->opr.ctx_type == WORD_DELIM
2303          || token->opr.ctx_type == NOT_WORD_DELIM)
2304	{
2305	  bin_tree_t *tree_first, *tree_last;
2306	  if (token->opr.ctx_type == WORD_DELIM)
2307	    {
2308	      token->opr.ctx_type = WORD_FIRST;
2309	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2310	      token->opr.ctx_type = WORD_LAST;
2311            }
2312          else
2313            {
2314	      token->opr.ctx_type = INSIDE_WORD;
2315	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2316	      token->opr.ctx_type = INSIDE_NOTWORD;
2317            }
2318	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2319	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2320	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2321	    {
2322	      *err = REG_ESPACE;
2323	      return NULL;
2324	    }
2325	}
2326      else
2327	{
2328	  tree = create_token_tree (dfa, NULL, NULL, token);
2329	  if (BE (tree == NULL, 0))
2330	    {
2331	      *err = REG_ESPACE;
2332	      return NULL;
2333	    }
2334	}
2335      /* We must return here, since ANCHORs can't be followed
2336	 by repetition operators.
2337	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2338	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2339      fetch_token (token, regexp, syntax);
2340      return tree;
2341    case OP_PERIOD:
2342      tree = create_token_tree (dfa, NULL, NULL, token);
2343      if (BE (tree == NULL, 0))
2344	{
2345	  *err = REG_ESPACE;
2346	  return NULL;
2347	}
2348      if (dfa->mb_cur_max > 1)
2349	dfa->has_mb_node = 1;
2350      break;
2351    case OP_WORD:
2352    case OP_NOTWORD:
2353      tree = build_charclass_op (dfa, regexp->trans,
2354				 (const unsigned char *) "alnum",
2355				 (const unsigned char *) "_",
2356				 token->type == OP_NOTWORD, err);
2357      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2358	return NULL;
2359      break;
2360    case OP_SPACE:
2361    case OP_NOTSPACE:
2362      tree = build_charclass_op (dfa, regexp->trans,
2363				 (const unsigned char *) "space",
2364				 (const unsigned char *) "",
2365				 token->type == OP_NOTSPACE, err);
2366      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2367	return NULL;
2368      break;
2369    case OP_ALT:
2370    case END_OF_RE:
2371      return NULL;
2372    case BACK_SLASH:
2373      *err = REG_EESCAPE;
2374      return NULL;
2375    default:
2376      /* Must not happen?  */
2377#ifdef DEBUG
2378      assert (0);
2379#endif
2380      return NULL;
2381    }
2382  fetch_token (token, regexp, syntax);
2383
2384  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2385	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2386    {
2387      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2388      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389	return NULL;
2390      /* In BRE consecutive duplications are not allowed.  */
2391      if ((syntax & RE_CONTEXT_INVALID_DUP)
2392	  && (token->type == OP_DUP_ASTERISK
2393	      || token->type == OP_OPEN_DUP_NUM))
2394	{
2395	  *err = REG_BADRPT;
2396	  return NULL;
2397	}
2398    }
2399
2400  return tree;
2401}
2402
2403/* This function build the following tree, from regular expression
2404   (<reg_exp>):
2405	 SUBEXP
2406	    |
2407	<reg_exp>
2408*/
2409
2410static bin_tree_t *
2411parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2412	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2413{
2414  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2415  bin_tree_t *tree;
2416  size_t cur_nsub;
2417  cur_nsub = preg->re_nsub++;
2418
2419  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2420
2421  /* The subexpression may be a null string.  */
2422  if (token->type == OP_CLOSE_SUBEXP)
2423    tree = NULL;
2424  else
2425    {
2426      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2427      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2428        *err = REG_EPAREN;
2429      if (BE (*err != REG_NOERROR, 0))
2430	return NULL;
2431    }
2432
2433  if (cur_nsub <= '9' - '1')
2434    dfa->completed_bkref_map |= 1 << cur_nsub;
2435
2436  tree = create_tree (dfa, tree, NULL, SUBEXP);
2437  if (BE (tree == NULL, 0))
2438    {
2439      *err = REG_ESPACE;
2440      return NULL;
2441    }
2442  tree->token.opr.idx = cur_nsub;
2443  return tree;
2444}
2445
2446/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2447
2448static bin_tree_t *
2449parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2450	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2451{
2452  bin_tree_t *tree = NULL, *old_tree = NULL;
2453  Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2454  re_token_t start_token = *token;
2455
2456  if (token->type == OP_OPEN_DUP_NUM)
2457    {
2458      end = 0;
2459      start = fetch_number (regexp, token, syntax);
2460      if (start == REG_MISSING)
2461	{
2462	  if (token->type == CHARACTER && token->opr.c == ',')
2463	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2464	  else
2465	    {
2466	      *err = REG_BADBR; /* <re>{} is invalid.  */
2467	      return NULL;
2468	    }
2469	}
2470      if (BE (start != REG_ERROR, 1))
2471	{
2472	  /* We treat "{n}" as "{n,n}".  */
2473	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2474		 : ((token->type == CHARACTER && token->opr.c == ',')
2475		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2476	}
2477      if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2478	{
2479	  /* Invalid sequence.  */
2480	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2481	    {
2482	      if (token->type == END_OF_RE)
2483		*err = REG_EBRACE;
2484	      else
2485		*err = REG_BADBR;
2486
2487	      return NULL;
2488	    }
2489
2490	  /* If the syntax bit is set, rollback.  */
2491	  re_string_set_index (regexp, start_idx);
2492	  *token = start_token;
2493	  token->type = CHARACTER;
2494	  /* mb_partial and word_char bits should be already initialized by
2495	     peek_token.  */
2496	  return elem;
2497	}
2498
2499      if (BE (end != REG_MISSING && start > end, 0))
2500	{
2501	  /* First number greater than second.  */
2502	  *err = REG_BADBR;
2503	  return NULL;
2504	}
2505    }
2506  else
2507    {
2508      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2509      end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2510    }
2511
2512  fetch_token (token, regexp, syntax);
2513
2514  if (BE (elem == NULL, 0))
2515    return NULL;
2516  if (BE (start == 0 && end == 0, 0))
2517    {
2518      postorder (elem, free_tree, NULL);
2519      return NULL;
2520    }
2521
2522  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2523  if (BE (start > 0, 0))
2524    {
2525      tree = elem;
2526      for (i = 2; i <= start; ++i)
2527	{
2528	  elem = duplicate_tree (elem, dfa);
2529	  tree = create_tree (dfa, tree, elem, CONCAT);
2530	  if (BE (elem == NULL || tree == NULL, 0))
2531	    goto parse_dup_op_espace;
2532	}
2533
2534      if (start == end)
2535	return tree;
2536
2537      /* Duplicate ELEM before it is marked optional.  */
2538      elem = duplicate_tree (elem, dfa);
2539      old_tree = tree;
2540    }
2541  else
2542    old_tree = NULL;
2543
2544  if (elem->token.type == SUBEXP)
2545    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2546
2547  tree = create_tree (dfa, elem, NULL,
2548		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2549  if (BE (tree == NULL, 0))
2550    goto parse_dup_op_espace;
2551
2552  /* This loop is actually executed only when end != REG_MISSING,
2553     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2554     already created the start+1-th copy.  */
2555  if ((Idx) -1 < 0 || end != REG_MISSING)
2556    for (i = start + 2; i <= end; ++i)
2557      {
2558	elem = duplicate_tree (elem, dfa);
2559	tree = create_tree (dfa, tree, elem, CONCAT);
2560	if (BE (elem == NULL || tree == NULL, 0))
2561	  goto parse_dup_op_espace;
2562
2563	tree = create_tree (dfa, tree, NULL, OP_ALT);
2564	if (BE (tree == NULL, 0))
2565	  goto parse_dup_op_espace;
2566      }
2567
2568  if (old_tree)
2569    tree = create_tree (dfa, old_tree, tree, CONCAT);
2570
2571  return tree;
2572
2573 parse_dup_op_espace:
2574  *err = REG_ESPACE;
2575  return NULL;
2576}
2577
2578/* Size of the names for collating symbol/equivalence_class/character_class.
2579   I'm not sure, but maybe enough.  */
2580#define BRACKET_NAME_BUF_SIZE 32
2581
2582#ifndef _LIBC
2583  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2584     Build the range expression which starts from START_ELEM, and ends
2585     at END_ELEM.  The result are written to MBCSET and SBCSET.
2586     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2587     mbcset->range_ends, is a pointer argument sinse we may
2588     update it.  */
2589
2590static reg_errcode_t
2591internal_function
2592# ifdef RE_ENABLE_I18N
2593build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2594		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2595# else /* not RE_ENABLE_I18N */
2596build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2597		 bracket_elem_t *end_elem)
2598# endif /* not RE_ENABLE_I18N */
2599{
2600  unsigned int start_ch, end_ch;
2601  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2602  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2603	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2604	  0))
2605    return REG_ERANGE;
2606
2607  /* We can handle no multi character collating elements without libc
2608     support.  */
2609  if (BE ((start_elem->type == COLL_SYM
2610	   && strlen ((char *) start_elem->opr.name) > 1)
2611	  || (end_elem->type == COLL_SYM
2612	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2613    return REG_ECOLLATE;
2614
2615# ifdef RE_ENABLE_I18N
2616  {
2617    wchar_t wc;
2618    wint_t start_wc;
2619    wint_t end_wc;
2620    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2621
2622    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2623		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2624		   : 0));
2625    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2626	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2627		 : 0));
2628    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2629		? __btowc (start_ch) : start_elem->opr.wch);
2630    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2631	      ? __btowc (end_ch) : end_elem->opr.wch);
2632    if (start_wc == WEOF || end_wc == WEOF)
2633      return REG_ECOLLATE;
2634    cmp_buf[0] = start_wc;
2635    cmp_buf[4] = end_wc;
2636    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2637      return REG_ERANGE;
2638
2639    /* Got valid collation sequence values, add them as a new entry.
2640       However, for !_LIBC we have no collation elements: if the
2641       character set is single byte, the single byte character set
2642       that we build below suffices.  parse_bracket_exp passes
2643       no MBCSET if dfa->mb_cur_max == 1.  */
2644    if (mbcset)
2645      {
2646        /* Check the space of the arrays.  */
2647        if (BE (*range_alloc == mbcset->nranges, 0))
2648          {
2649	    /* There is not enough space, need realloc.  */
2650	    wchar_t *new_array_start, *new_array_end;
2651	    Idx new_nranges;
2652
2653	    /* +1 in case of mbcset->nranges is 0.  */
2654	    new_nranges = 2 * mbcset->nranges + 1;
2655	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2656	       are NULL if *range_alloc == 0.  */
2657	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2658				          new_nranges);
2659	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2660				        new_nranges);
2661
2662	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2663	      return REG_ESPACE;
2664
2665	    mbcset->range_starts = new_array_start;
2666	    mbcset->range_ends = new_array_end;
2667	    *range_alloc = new_nranges;
2668          }
2669
2670        mbcset->range_starts[mbcset->nranges] = start_wc;
2671        mbcset->range_ends[mbcset->nranges++] = end_wc;
2672      }
2673
2674    /* Build the table for single byte characters.  */
2675    for (wc = 0; wc < SBC_MAX; ++wc)
2676      {
2677	cmp_buf[2] = wc;
2678	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2679	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2680	  bitset_set (sbcset, wc);
2681      }
2682  }
2683# else /* not RE_ENABLE_I18N */
2684  {
2685    unsigned int ch;
2686    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2687		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2688		   : 0));
2689    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2690	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2691		 : 0));
2692    if (start_ch > end_ch)
2693      return REG_ERANGE;
2694    /* Build the table for single byte characters.  */
2695    for (ch = 0; ch < SBC_MAX; ++ch)
2696      if (start_ch <= ch  && ch <= end_ch)
2697	bitset_set (sbcset, ch);
2698  }
2699# endif /* not RE_ENABLE_I18N */
2700  return REG_NOERROR;
2701}
2702#endif /* not _LIBC */
2703
2704#ifndef _LIBC
2705/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2706   Build the collating element which is represented by NAME.
2707   The result are written to MBCSET and SBCSET.
2708   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2709   pointer argument since we may update it.  */
2710
2711static reg_errcode_t
2712internal_function
2713build_collating_symbol (bitset_t sbcset,
2714# ifdef RE_ENABLE_I18N
2715			re_charset_t *mbcset, Idx *coll_sym_alloc,
2716# endif
2717			const unsigned char *name)
2718{
2719  size_t name_len = strlen ((const char *) name);
2720  if (BE (name_len != 1, 0))
2721    return REG_ECOLLATE;
2722  else
2723    {
2724      bitset_set (sbcset, name[0]);
2725      return REG_NOERROR;
2726    }
2727}
2728#endif /* not _LIBC */
2729
2730/* This function parse bracket expression like "[abc]", "[a-c]",
2731   "[[.a-a.]]" etc.  */
2732
2733static bin_tree_t *
2734parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2735		   reg_syntax_t syntax, reg_errcode_t *err)
2736{
2737#ifdef _LIBC
2738  const unsigned char *collseqmb;
2739  const char *collseqwc;
2740  uint32_t nrules;
2741  int32_t table_size;
2742  const int32_t *symb_table;
2743  const unsigned char *extra;
2744
2745  /* Local function for parse_bracket_exp used in _LIBC environement.
2746     Seek the collating symbol entry correspondings to NAME.
2747     Return the index of the symbol in the SYMB_TABLE.  */
2748
2749  auto inline int32_t
2750  __attribute ((always_inline))
2751  seek_collating_symbol_entry (name, name_len)
2752	 const unsigned char *name;
2753	 size_t name_len;
2754    {
2755      int32_t hash = elem_hash ((const char *) name, name_len);
2756      int32_t elem = hash % table_size;
2757      if (symb_table[2 * elem] != 0)
2758	{
2759	  int32_t second = hash % (table_size - 2) + 1;
2760
2761	  do
2762	    {
2763	      /* First compare the hashing value.  */
2764	      if (symb_table[2 * elem] == hash
2765		  /* Compare the length of the name.  */
2766		  && name_len == extra[symb_table[2 * elem + 1]]
2767		  /* Compare the name.  */
2768		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2769			     name_len) == 0)
2770		{
2771		  /* Yep, this is the entry.  */
2772		  break;
2773		}
2774
2775	      /* Next entry.  */
2776	      elem += second;
2777	    }
2778	  while (symb_table[2 * elem] != 0);
2779	}
2780      return elem;
2781    }
2782
2783  /* Local function for parse_bracket_exp used in _LIBC environement.
2784     Look up the collation sequence value of BR_ELEM.
2785     Return the value if succeeded, UINT_MAX otherwise.  */
2786
2787  auto inline unsigned int
2788  __attribute ((always_inline))
2789  lookup_collation_sequence_value (br_elem)
2790	 bracket_elem_t *br_elem;
2791    {
2792      if (br_elem->type == SB_CHAR)
2793	{
2794	  /*
2795	  if (MB_CUR_MAX == 1)
2796	  */
2797	  if (nrules == 0)
2798	    return collseqmb[br_elem->opr.ch];
2799	  else
2800	    {
2801	      wint_t wc = __btowc (br_elem->opr.ch);
2802	      return __collseq_table_lookup (collseqwc, wc);
2803	    }
2804	}
2805      else if (br_elem->type == MB_CHAR)
2806	{
2807	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2808	}
2809      else if (br_elem->type == COLL_SYM)
2810	{
2811	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2812	  if (nrules != 0)
2813	    {
2814	      int32_t elem, idx;
2815	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2816						  sym_name_len);
2817	      if (symb_table[2 * elem] != 0)
2818		{
2819		  /* We found the entry.  */
2820		  idx = symb_table[2 * elem + 1];
2821		  /* Skip the name of collating element name.  */
2822		  idx += 1 + extra[idx];
2823		  /* Skip the byte sequence of the collating element.  */
2824		  idx += 1 + extra[idx];
2825		  /* Adjust for the alignment.  */
2826		  idx = (idx + 3) & ~3;
2827		  /* Skip the multibyte collation sequence value.  */
2828		  idx += sizeof (unsigned int);
2829		  /* Skip the wide char sequence of the collating element.  */
2830		  idx += sizeof (unsigned int) *
2831		    (1 + *(unsigned int *) (extra + idx));
2832		  /* Return the collation sequence value.  */
2833		  return *(unsigned int *) (extra + idx);
2834		}
2835	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2836		{
2837		  /* No valid character.  Match it as a single byte
2838		     character.  */
2839		  return collseqmb[br_elem->opr.name[0]];
2840		}
2841	    }
2842	  else if (sym_name_len == 1)
2843	    return collseqmb[br_elem->opr.name[0]];
2844	}
2845      return UINT_MAX;
2846    }
2847
2848  /* Local function for parse_bracket_exp used in _LIBC environement.
2849     Build the range expression which starts from START_ELEM, and ends
2850     at END_ELEM.  The result are written to MBCSET and SBCSET.
2851     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2852     mbcset->range_ends, is a pointer argument sinse we may
2853     update it.  */
2854
2855  auto inline reg_errcode_t
2856  __attribute ((always_inline))
2857  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2858	 re_charset_t *mbcset;
2859	 Idx *range_alloc;
2860	 bitset_t sbcset;
2861	 bracket_elem_t *start_elem, *end_elem;
2862    {
2863      unsigned int ch;
2864      uint32_t start_collseq;
2865      uint32_t end_collseq;
2866
2867      /* Equivalence Classes and Character Classes can't be a range
2868	 start/end.  */
2869      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2870	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2871	      0))
2872	return REG_ERANGE;
2873
2874      start_collseq = lookup_collation_sequence_value (start_elem);
2875      end_collseq = lookup_collation_sequence_value (end_elem);
2876      /* Check start/end collation sequence values.  */
2877      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2878	return REG_ECOLLATE;
2879      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2880	return REG_ERANGE;
2881
2882      /* Got valid collation sequence values, add them as a new entry.
2883	 However, if we have no collation elements, and the character set
2884	 is single byte, the single byte character set that we
2885	 build below suffices. */
2886      if (nrules > 0 || dfa->mb_cur_max > 1)
2887	{
2888          /* Check the space of the arrays.  */
2889          if (BE (*range_alloc == mbcset->nranges, 0))
2890	    {
2891	      /* There is not enough space, need realloc.  */
2892	      uint32_t *new_array_start;
2893	      uint32_t *new_array_end;
2894	      Idx new_nranges;
2895
2896	      /* +1 in case of mbcset->nranges is 0.  */
2897	      new_nranges = 2 * mbcset->nranges + 1;
2898	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2899					    new_nranges);
2900	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2901				          new_nranges);
2902
2903	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2904	        return REG_ESPACE;
2905
2906	      mbcset->range_starts = new_array_start;
2907	      mbcset->range_ends = new_array_end;
2908	      *range_alloc = new_nranges;
2909	    }
2910
2911          mbcset->range_starts[mbcset->nranges] = start_collseq;
2912          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2913	}
2914
2915      /* Build the table for single byte characters.  */
2916      for (ch = 0; ch < SBC_MAX; ch++)
2917	{
2918	  uint32_t ch_collseq;
2919	  /*
2920	  if (MB_CUR_MAX == 1)
2921	  */
2922	  if (nrules == 0)
2923	    ch_collseq = collseqmb[ch];
2924	  else
2925	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2926	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2927	    bitset_set (sbcset, ch);
2928	}
2929      return REG_NOERROR;
2930    }
2931
2932  /* Local function for parse_bracket_exp used in _LIBC environement.
2933     Build the collating element which is represented by NAME.
2934     The result are written to MBCSET and SBCSET.
2935     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2936     pointer argument sinse we may update it.  */
2937
2938  auto inline reg_errcode_t
2939  __attribute ((always_inline))
2940  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2941	 re_charset_t *mbcset;
2942	 Idx *coll_sym_alloc;
2943	 bitset_t sbcset;
2944	 const unsigned char *name;
2945    {
2946      int32_t elem, idx;
2947      size_t name_len = strlen ((const char *) name);
2948      if (nrules != 0)
2949	{
2950	  elem = seek_collating_symbol_entry (name, name_len);
2951	  if (symb_table[2 * elem] != 0)
2952	    {
2953	      /* We found the entry.  */
2954	      idx = symb_table[2 * elem + 1];
2955	      /* Skip the name of collating element name.  */
2956	      idx += 1 + extra[idx];
2957	    }
2958	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2959	    {
2960	      /* No valid character, treat it as a normal
2961		 character.  */
2962	      bitset_set (sbcset, name[0]);
2963	      return REG_NOERROR;
2964	    }
2965	  else
2966	    return REG_ECOLLATE;
2967
2968	  /* Got valid collation sequence, add it as a new entry.  */
2969	  /* Check the space of the arrays.  */
2970	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2971	    {
2972	      /* Not enough, realloc it.  */
2973	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2974	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2975	      /* Use realloc since mbcset->coll_syms is NULL
2976		 if *alloc == 0.  */
2977	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2978						   new_coll_sym_alloc);
2979	      if (BE (new_coll_syms == NULL, 0))
2980		return REG_ESPACE;
2981	      mbcset->coll_syms = new_coll_syms;
2982	      *coll_sym_alloc = new_coll_sym_alloc;
2983	    }
2984	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2985	  return REG_NOERROR;
2986	}
2987      else
2988	{
2989	  if (BE (name_len != 1, 0))
2990	    return REG_ECOLLATE;
2991	  else
2992	    {
2993	      bitset_set (sbcset, name[0]);
2994	      return REG_NOERROR;
2995	    }
2996	}
2997    }
2998#endif
2999
3000  re_token_t br_token;
3001  re_bitset_ptr_t sbcset;
3002#ifdef RE_ENABLE_I18N
3003  re_charset_t *mbcset;
3004  Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3005  Idx equiv_class_alloc = 0, char_class_alloc = 0;
3006#endif /* not RE_ENABLE_I18N */
3007  bool non_match = false;
3008  bin_tree_t *work_tree;
3009  int token_len;
3010  bool first_round = true;
3011#ifdef _LIBC
3012  collseqmb = (const unsigned char *)
3013    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3014  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3015  if (nrules)
3016    {
3017      /*
3018      if (MB_CUR_MAX > 1)
3019      */
3020      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3021      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3022      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3023						  _NL_COLLATE_SYMB_TABLEMB);
3024      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3025						   _NL_COLLATE_SYMB_EXTRAMB);
3026    }
3027#endif
3028  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3029#ifdef RE_ENABLE_I18N
3030  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3031#endif /* RE_ENABLE_I18N */
3032#ifdef RE_ENABLE_I18N
3033  if (BE (sbcset == NULL || mbcset == NULL, 0))
3034#else
3035  if (BE (sbcset == NULL, 0))
3036#endif /* RE_ENABLE_I18N */
3037    {
3038      *err = REG_ESPACE;
3039      return NULL;
3040    }
3041
3042  token_len = peek_token_bracket (token, regexp, syntax);
3043  if (BE (token->type == END_OF_RE, 0))
3044    {
3045      *err = REG_BADPAT;
3046      goto parse_bracket_exp_free_return;
3047    }
3048  if (token->type == OP_NON_MATCH_LIST)
3049    {
3050#ifdef RE_ENABLE_I18N
3051      mbcset->non_match = 1;
3052#endif /* not RE_ENABLE_I18N */
3053      non_match = true;
3054      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3055	bitset_set (sbcset, '\n');
3056      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3057      token_len = peek_token_bracket (token, regexp, syntax);
3058      if (BE (token->type == END_OF_RE, 0))
3059	{
3060	  *err = REG_BADPAT;
3061	  goto parse_bracket_exp_free_return;
3062	}
3063    }
3064
3065  /* We treat the first ']' as a normal character.  */
3066  if (token->type == OP_CLOSE_BRACKET)
3067    token->type = CHARACTER;
3068
3069  while (1)
3070    {
3071      bracket_elem_t start_elem, end_elem;
3072      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3073      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3074      reg_errcode_t ret;
3075      int token_len2 = 0;
3076      bool is_range_exp = false;
3077      re_token_t token2;
3078
3079      start_elem.opr.name = start_name_buf;
3080      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3081				   syntax, first_round);
3082      if (BE (ret != REG_NOERROR, 0))
3083	{
3084	  *err = ret;
3085	  goto parse_bracket_exp_free_return;
3086	}
3087      first_round = false;
3088
3089      /* Get information about the next token.  We need it in any case.  */
3090      token_len = peek_token_bracket (token, regexp, syntax);
3091
3092      /* Do not check for ranges if we know they are not allowed.  */
3093      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3094	{
3095	  if (BE (token->type == END_OF_RE, 0))
3096	    {
3097	      *err = REG_EBRACK;
3098	      goto parse_bracket_exp_free_return;
3099	    }
3100	  if (token->type == OP_CHARSET_RANGE)
3101	    {
3102	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3103	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3104	      if (BE (token2.type == END_OF_RE, 0))
3105		{
3106		  *err = REG_EBRACK;
3107		  goto parse_bracket_exp_free_return;
3108		}
3109	      if (token2.type == OP_CLOSE_BRACKET)
3110		{
3111		  /* We treat the last '-' as a normal character.  */
3112		  re_string_skip_bytes (regexp, -token_len);
3113		  token->type = CHARACTER;
3114		}
3115	      else
3116		is_range_exp = true;
3117	    }
3118	}
3119
3120      if (is_range_exp == true)
3121	{
3122	  end_elem.opr.name = end_name_buf;
3123	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3124				       dfa, syntax, true);
3125	  if (BE (ret != REG_NOERROR, 0))
3126	    {
3127	      *err = ret;
3128	      goto parse_bracket_exp_free_return;
3129	    }
3130
3131	  token_len = peek_token_bracket (token, regexp, syntax);
3132
3133#ifdef _LIBC
3134	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3135				  &start_elem, &end_elem);
3136#else
3137# ifdef RE_ENABLE_I18N
3138	  *err = build_range_exp (sbcset,
3139				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3140				  &range_alloc, &start_elem, &end_elem);
3141# else
3142	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3143# endif
3144#endif /* RE_ENABLE_I18N */
3145	  if (BE (*err != REG_NOERROR, 0))
3146	    goto parse_bracket_exp_free_return;
3147	}
3148      else
3149	{
3150	  switch (start_elem.type)
3151	    {
3152	    case SB_CHAR:
3153	      bitset_set (sbcset, start_elem.opr.ch);
3154	      break;
3155#ifdef RE_ENABLE_I18N
3156	    case MB_CHAR:
3157	      /* Check whether the array has enough space.  */
3158	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3159		{
3160		  wchar_t *new_mbchars;
3161		  /* Not enough, realloc it.  */
3162		  /* +1 in case of mbcset->nmbchars is 0.  */
3163		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3164		  /* Use realloc since array is NULL if *alloc == 0.  */
3165		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3166					    mbchar_alloc);
3167		  if (BE (new_mbchars == NULL, 0))
3168		    goto parse_bracket_exp_espace;
3169		  mbcset->mbchars = new_mbchars;
3170		}
3171	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3172	      break;
3173#endif /* RE_ENABLE_I18N */
3174	    case EQUIV_CLASS:
3175	      *err = build_equiv_class (sbcset,
3176#ifdef RE_ENABLE_I18N
3177					mbcset, &equiv_class_alloc,
3178#endif /* RE_ENABLE_I18N */
3179					start_elem.opr.name);
3180	      if (BE (*err != REG_NOERROR, 0))
3181		goto parse_bracket_exp_free_return;
3182	      break;
3183	    case COLL_SYM:
3184	      *err = build_collating_symbol (sbcset,
3185#ifdef RE_ENABLE_I18N
3186					     mbcset, &coll_sym_alloc,
3187#endif /* RE_ENABLE_I18N */
3188					     start_elem.opr.name);
3189	      if (BE (*err != REG_NOERROR, 0))
3190		goto parse_bracket_exp_free_return;
3191	      break;
3192	    case CHAR_CLASS:
3193	      *err = build_charclass (regexp->trans, sbcset,
3194#ifdef RE_ENABLE_I18N
3195				      mbcset, &char_class_alloc,
3196#endif /* RE_ENABLE_I18N */
3197				      start_elem.opr.name, syntax);
3198	      if (BE (*err != REG_NOERROR, 0))
3199	       goto parse_bracket_exp_free_return;
3200	      break;
3201	    default:
3202	      assert (0);
3203	      break;
3204	    }
3205	}
3206      if (BE (token->type == END_OF_RE, 0))
3207	{
3208	  *err = REG_EBRACK;
3209	  goto parse_bracket_exp_free_return;
3210	}
3211      if (token->type == OP_CLOSE_BRACKET)
3212	break;
3213    }
3214
3215  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3216
3217  /* If it is non-matching list.  */
3218  if (non_match)
3219    bitset_not (sbcset);
3220
3221#ifdef RE_ENABLE_I18N
3222  /* Ensure only single byte characters are set.  */
3223  if (dfa->mb_cur_max > 1)
3224    bitset_mask (sbcset, dfa->sb_char);
3225
3226  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3227      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3228						     || mbcset->non_match)))
3229    {
3230      bin_tree_t *mbc_tree;
3231      int sbc_idx;
3232      /* Build a tree for complex bracket.  */
3233      dfa->has_mb_node = 1;
3234      br_token.type = COMPLEX_BRACKET;
3235      br_token.opr.mbcset = mbcset;
3236      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3237      if (BE (mbc_tree == NULL, 0))
3238	goto parse_bracket_exp_espace;
3239      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3240	if (sbcset[sbc_idx])
3241	  break;
3242      /* If there are no bits set in sbcset, there is no point
3243	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3244      if (sbc_idx < BITSET_WORDS)
3245	{
3246          /* Build a tree for simple bracket.  */
3247          br_token.type = SIMPLE_BRACKET;
3248          br_token.opr.sbcset = sbcset;
3249          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3250          if (BE (work_tree == NULL, 0))
3251            goto parse_bracket_exp_espace;
3252
3253          /* Then join them by ALT node.  */
3254          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3255          if (BE (work_tree == NULL, 0))
3256            goto parse_bracket_exp_espace;
3257	}
3258      else
3259	{
3260	  re_free (sbcset);
3261	  work_tree = mbc_tree;
3262	}
3263    }
3264  else
3265#endif /* not RE_ENABLE_I18N */
3266    {
3267#ifdef RE_ENABLE_I18N
3268      free_charset (mbcset);
3269#endif
3270      /* Build a tree for simple bracket.  */
3271      br_token.type = SIMPLE_BRACKET;
3272      br_token.opr.sbcset = sbcset;
3273      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3274      if (BE (work_tree == NULL, 0))
3275        goto parse_bracket_exp_espace;
3276    }
3277  return work_tree;
3278
3279 parse_bracket_exp_espace:
3280  *err = REG_ESPACE;
3281 parse_bracket_exp_free_return:
3282  re_free (sbcset);
3283#ifdef RE_ENABLE_I18N
3284  free_charset (mbcset);
3285#endif /* RE_ENABLE_I18N */
3286  return NULL;
3287}
3288
3289/* Parse an element in the bracket expression.  */
3290
3291static reg_errcode_t
3292parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3293		       re_token_t *token, int token_len, re_dfa_t *dfa,
3294		       reg_syntax_t syntax, bool accept_hyphen)
3295{
3296#ifdef RE_ENABLE_I18N
3297  int cur_char_size;
3298  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3299  if (cur_char_size > 1)
3300    {
3301      elem->type = MB_CHAR;
3302      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3303      re_string_skip_bytes (regexp, cur_char_size);
3304      return REG_NOERROR;
3305    }
3306#endif /* RE_ENABLE_I18N */
3307  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3308  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3309      || token->type == OP_OPEN_EQUIV_CLASS)
3310    return parse_bracket_symbol (elem, regexp, token);
3311  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3312    {
3313      /* A '-' must only appear as anything but a range indicator before
3314	 the closing bracket.  Everything else is an error.  */
3315      re_token_t token2;
3316      (void) peek_token_bracket (&token2, regexp, syntax);
3317      if (token2.type != OP_CLOSE_BRACKET)
3318	/* The actual error value is not standardized since this whole
3319	   case is undefined.  But ERANGE makes good sense.  */
3320	return REG_ERANGE;
3321    }
3322  elem->type = SB_CHAR;
3323  elem->opr.ch = token->opr.c;
3324  return REG_NOERROR;
3325}
3326
3327/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3328   such as [:<character_class>:], [.<collating_element>.], and
3329   [=<equivalent_class>=].  */
3330
3331static reg_errcode_t
3332parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3333		      re_token_t *token)
3334{
3335  unsigned char ch, delim = token->opr.c;
3336  int i = 0;
3337  if (re_string_eoi(regexp))
3338    return REG_EBRACK;
3339  for (;; ++i)
3340    {
3341      if (i >= BRACKET_NAME_BUF_SIZE)
3342	return REG_EBRACK;
3343      if (token->type == OP_OPEN_CHAR_CLASS)
3344	ch = re_string_fetch_byte_case (regexp);
3345      else
3346	ch = re_string_fetch_byte (regexp);
3347      if (re_string_eoi(regexp))
3348	return REG_EBRACK;
3349      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3350	break;
3351      elem->opr.name[i] = ch;
3352    }
3353  re_string_skip_bytes (regexp, 1);
3354  elem->opr.name[i] = '\0';
3355  switch (token->type)
3356    {
3357    case OP_OPEN_COLL_ELEM:
3358      elem->type = COLL_SYM;
3359      break;
3360    case OP_OPEN_EQUIV_CLASS:
3361      elem->type = EQUIV_CLASS;
3362      break;
3363    case OP_OPEN_CHAR_CLASS:
3364      elem->type = CHAR_CLASS;
3365      break;
3366    default:
3367      break;
3368    }
3369  return REG_NOERROR;
3370}
3371
3372  /* Helper function for parse_bracket_exp.
3373     Build the equivalence class which is represented by NAME.
3374     The result are written to MBCSET and SBCSET.
3375     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3376     is a pointer argument sinse we may update it.  */
3377
3378static reg_errcode_t
3379#ifdef RE_ENABLE_I18N
3380build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3381		   Idx *equiv_class_alloc, const unsigned char *name)
3382#else /* not RE_ENABLE_I18N */
3383build_equiv_class (bitset_t sbcset, const unsigned char *name)
3384#endif /* not RE_ENABLE_I18N */
3385{
3386#ifdef _LIBC
3387  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388  if (nrules != 0)
3389    {
3390      const int32_t *table, *indirect;
3391      const unsigned char *weights, *extra, *cp;
3392      unsigned char char_buf[2];
3393      int32_t idx1, idx2;
3394      unsigned int ch;
3395      size_t len;
3396      /* This #include defines a local function!  */
3397# include <locale/weight.h>
3398      /* Calculate the index for equivalence class.  */
3399      cp = name;
3400      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3401      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402					       _NL_COLLATE_WEIGHTMB);
3403      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3404						   _NL_COLLATE_EXTRAMB);
3405      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3406						_NL_COLLATE_INDIRECTMB);
3407      idx1 = findidx (&cp);
3408      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3409	/* This isn't a valid character.  */
3410	return REG_ECOLLATE;
3411
3412      /* Build single byte matcing table for this equivalence class.  */
3413      char_buf[1] = (unsigned char) '\0';
3414      len = weights[idx1];
3415      for (ch = 0; ch < SBC_MAX; ++ch)
3416	{
3417	  char_buf[0] = ch;
3418	  cp = char_buf;
3419	  idx2 = findidx (&cp);
3420/*
3421	  idx2 = table[ch];
3422*/
3423	  if (idx2 == 0)
3424	    /* This isn't a valid character.  */
3425	    continue;
3426	  if (len == weights[idx2])
3427	    {
3428	      int cnt = 0;
3429	      while (cnt <= len &&
3430		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3431		++cnt;
3432
3433	      if (cnt > len)
3434		bitset_set (sbcset, ch);
3435	    }
3436	}
3437      /* Check whether the array has enough space.  */
3438      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3439	{
3440	  /* Not enough, realloc it.  */
3441	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3442	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3443	  /* Use realloc since the array is NULL if *alloc == 0.  */
3444	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3445						   int32_t,
3446						   new_equiv_class_alloc);
3447	  if (BE (new_equiv_classes == NULL, 0))
3448	    return REG_ESPACE;
3449	  mbcset->equiv_classes = new_equiv_classes;
3450	  *equiv_class_alloc = new_equiv_class_alloc;
3451	}
3452      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3453    }
3454  else
3455#endif /* _LIBC */
3456    {
3457      if (BE (strlen ((const char *) name) != 1, 0))
3458	return REG_ECOLLATE;
3459      bitset_set (sbcset, *name);
3460    }
3461  return REG_NOERROR;
3462}
3463
3464  /* Helper function for parse_bracket_exp.
3465     Build the character class which is represented by NAME.
3466     The result are written to MBCSET and SBCSET.
3467     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3468     is a pointer argument sinse we may update it.  */
3469
3470static reg_errcode_t
3471#ifdef RE_ENABLE_I18N
3472build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3473		 re_charset_t *mbcset, Idx *char_class_alloc,
3474		 const unsigned char *class_name, reg_syntax_t syntax)
3475#else /* not RE_ENABLE_I18N */
3476build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3477		 const unsigned char *class_name, reg_syntax_t syntax)
3478#endif /* not RE_ENABLE_I18N */
3479{
3480  int i;
3481  const char *name = (const char *) class_name;
3482
3483  /* In case of REG_ICASE "upper" and "lower" match the both of
3484     upper and lower cases.  */
3485  if ((syntax & RE_ICASE)
3486      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3487    name = "alpha";
3488
3489#ifdef RE_ENABLE_I18N
3490  /* Check the space of the arrays.  */
3491  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3492    {
3493      /* Not enough, realloc it.  */
3494      /* +1 in case of mbcset->nchar_classes is 0.  */
3495      Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3496      /* Use realloc since array is NULL if *alloc == 0.  */
3497      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3498					       new_char_class_alloc);
3499      if (BE (new_char_classes == NULL, 0))
3500	return REG_ESPACE;
3501      mbcset->char_classes = new_char_classes;
3502      *char_class_alloc = new_char_class_alloc;
3503    }
3504  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3505#endif /* RE_ENABLE_I18N */
3506
3507#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3508  do {						\
3509    if (BE (trans != NULL, 0))			\
3510      {						\
3511	for (i = 0; i < SBC_MAX; ++i)		\
3512	  if (ctype_func (i))			\
3513	    bitset_set (sbcset, trans[i]);	\
3514      }						\
3515    else					\
3516      {						\
3517	for (i = 0; i < SBC_MAX; ++i)		\
3518	  if (ctype_func (i))			\
3519	    bitset_set (sbcset, i);		\
3520      }						\
3521  } while (0)
3522
3523  if (strcmp (name, "alnum") == 0)
3524    BUILD_CHARCLASS_LOOP (isalnum);
3525  else if (strcmp (name, "cntrl") == 0)
3526    BUILD_CHARCLASS_LOOP (iscntrl);
3527  else if (strcmp (name, "lower") == 0)
3528    BUILD_CHARCLASS_LOOP (islower);
3529  else if (strcmp (name, "space") == 0)
3530    BUILD_CHARCLASS_LOOP (isspace);
3531  else if (strcmp (name, "alpha") == 0)
3532    BUILD_CHARCLASS_LOOP (isalpha);
3533  else if (strcmp (name, "digit") == 0)
3534    BUILD_CHARCLASS_LOOP (isdigit);
3535  else if (strcmp (name, "print") == 0)
3536    BUILD_CHARCLASS_LOOP (isprint);
3537  else if (strcmp (name, "upper") == 0)
3538    BUILD_CHARCLASS_LOOP (isupper);
3539  else if (strcmp (name, "blank") == 0)
3540    BUILD_CHARCLASS_LOOP (isblank);
3541  else if (strcmp (name, "graph") == 0)
3542    BUILD_CHARCLASS_LOOP (isgraph);
3543  else if (strcmp (name, "punct") == 0)
3544    BUILD_CHARCLASS_LOOP (ispunct);
3545  else if (strcmp (name, "xdigit") == 0)
3546    BUILD_CHARCLASS_LOOP (isxdigit);
3547  else
3548    return REG_ECTYPE;
3549
3550  return REG_NOERROR;
3551}
3552
3553static bin_tree_t *
3554build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3555		    const unsigned char *class_name,
3556		    const unsigned char *extra, bool non_match,
3557		    reg_errcode_t *err)
3558{
3559  re_bitset_ptr_t sbcset;
3560#ifdef RE_ENABLE_I18N
3561  re_charset_t *mbcset;
3562  Idx alloc = 0;
3563#endif /* not RE_ENABLE_I18N */
3564  reg_errcode_t ret;
3565  re_token_t br_token;
3566  bin_tree_t *tree;
3567
3568  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3569#ifdef RE_ENABLE_I18N
3570  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3571#endif /* RE_ENABLE_I18N */
3572
3573#ifdef RE_ENABLE_I18N
3574  if (BE (sbcset == NULL || mbcset == NULL, 0))
3575#else /* not RE_ENABLE_I18N */
3576  if (BE (sbcset == NULL, 0))
3577#endif /* not RE_ENABLE_I18N */
3578    {
3579      *err = REG_ESPACE;
3580      return NULL;
3581    }
3582
3583  if (non_match)
3584    {
3585#ifdef RE_ENABLE_I18N
3586      mbcset->non_match = 1;
3587#endif /* not RE_ENABLE_I18N */
3588    }
3589
3590  /* We don't care the syntax in this case.  */
3591  ret = build_charclass (trans, sbcset,
3592#ifdef RE_ENABLE_I18N
3593			 mbcset, &alloc,
3594#endif /* RE_ENABLE_I18N */
3595			 class_name, 0);
3596
3597  if (BE (ret != REG_NOERROR, 0))
3598    {
3599      re_free (sbcset);
3600#ifdef RE_ENABLE_I18N
3601      free_charset (mbcset);
3602#endif /* RE_ENABLE_I18N */
3603      *err = ret;
3604      return NULL;
3605    }
3606  /* \w match '_' also.  */
3607  for (; *extra; extra++)
3608    bitset_set (sbcset, *extra);
3609
3610  /* If it is non-matching list.  */
3611  if (non_match)
3612    bitset_not (sbcset);
3613
3614#ifdef RE_ENABLE_I18N
3615  /* Ensure only single byte characters are set.  */
3616  if (dfa->mb_cur_max > 1)
3617    bitset_mask (sbcset, dfa->sb_char);
3618#endif
3619
3620  /* Build a tree for simple bracket.  */
3621  br_token.type = SIMPLE_BRACKET;
3622  br_token.opr.sbcset = sbcset;
3623  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3624  if (BE (tree == NULL, 0))
3625    goto build_word_op_espace;
3626
3627#ifdef RE_ENABLE_I18N
3628  if (dfa->mb_cur_max > 1)
3629    {
3630      bin_tree_t *mbc_tree;
3631      /* Build a tree for complex bracket.  */
3632      br_token.type = COMPLEX_BRACKET;
3633      br_token.opr.mbcset = mbcset;
3634      dfa->has_mb_node = 1;
3635      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3636      if (BE (mbc_tree == NULL, 0))
3637	goto build_word_op_espace;
3638      /* Then join them by ALT node.  */
3639      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3640      if (BE (mbc_tree != NULL, 1))
3641	return tree;
3642    }
3643  else
3644    {
3645      free_charset (mbcset);
3646      return tree;
3647    }
3648#else /* not RE_ENABLE_I18N */
3649  return tree;
3650#endif /* not RE_ENABLE_I18N */
3651
3652 build_word_op_espace:
3653  re_free (sbcset);
3654#ifdef RE_ENABLE_I18N
3655  free_charset (mbcset);
3656#endif /* RE_ENABLE_I18N */
3657  *err = REG_ESPACE;
3658  return NULL;
3659}
3660
3661/* This is intended for the expressions like "a{1,3}".
3662   Fetch a number from `input', and return the number.
3663   Return REG_MISSING if the number field is empty like "{,1}".
3664   Return REG_ERROR if an error occurred.  */
3665
3666static Idx
3667fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3668{
3669  Idx num = REG_MISSING;
3670  unsigned char c;
3671  while (1)
3672    {
3673      fetch_token (token, input, syntax);
3674      c = token->opr.c;
3675      if (BE (token->type == END_OF_RE, 0))
3676	return REG_ERROR;
3677      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3678	break;
3679      num = ((token->type != CHARACTER || c < '0' || '9' < c
3680	      || num == REG_ERROR)
3681	     ? REG_ERROR
3682	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3683      num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3684    }
3685  return num;
3686}
3687
3688#ifdef RE_ENABLE_I18N
3689static void
3690free_charset (re_charset_t *cset)
3691{
3692  re_free (cset->mbchars);
3693# ifdef _LIBC
3694  re_free (cset->coll_syms);
3695  re_free (cset->equiv_classes);
3696  re_free (cset->range_starts);
3697  re_free (cset->range_ends);
3698# endif
3699  re_free (cset->char_classes);
3700  re_free (cset);
3701}
3702#endif /* RE_ENABLE_I18N */
3703
3704/* Functions for binary tree operation.  */
3705
3706/* Create a tree node.  */
3707
3708static bin_tree_t *
3709create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3710	     re_token_type_t type)
3711{
3712  re_token_t t;
3713  t.type = type;
3714  return create_token_tree (dfa, left, right, &t);
3715}
3716
3717static bin_tree_t *
3718create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3719		   const re_token_t *token)
3720{
3721  bin_tree_t *tree;
3722  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3723    {
3724      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3725
3726      if (storage == NULL)
3727	return NULL;
3728      storage->next = dfa->str_tree_storage;
3729      dfa->str_tree_storage = storage;
3730      dfa->str_tree_storage_idx = 0;
3731    }
3732  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3733
3734  tree->parent = NULL;
3735  tree->left = left;
3736  tree->right = right;
3737  tree->token = *token;
3738  tree->token.duplicated = 0;
3739  tree->token.opt_subexp = 0;
3740  tree->first = NULL;
3741  tree->next = NULL;
3742  tree->node_idx = REG_MISSING;
3743
3744  if (left != NULL)
3745    left->parent = tree;
3746  if (right != NULL)
3747    right->parent = tree;
3748  return tree;
3749}
3750
3751/* Mark the tree SRC as an optional subexpression.
3752   To be called from preorder or postorder.  */
3753
3754static reg_errcode_t
3755mark_opt_subexp (void *extra, bin_tree_t *node)
3756{
3757  Idx idx = (Idx) (long) extra;
3758  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3759    node->token.opt_subexp = 1;
3760
3761  return REG_NOERROR;
3762}
3763
3764/* Free the allocated memory inside NODE. */
3765
3766static void
3767free_token (re_token_t *node)
3768{
3769#ifdef RE_ENABLE_I18N
3770  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3771    free_charset (node->opr.mbcset);
3772  else
3773#endif /* RE_ENABLE_I18N */
3774    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3775      re_free (node->opr.sbcset);
3776}
3777
3778/* Worker function for tree walking.  Free the allocated memory inside NODE
3779   and its children. */
3780
3781static reg_errcode_t
3782free_tree (void *extra, bin_tree_t *node)
3783{
3784  free_token (&node->token);
3785  return REG_NOERROR;
3786}
3787
3788
3789/* Duplicate the node SRC, and return new node.  This is a preorder
3790   visit similar to the one implemented by the generic visitor, but
3791   we need more infrastructure to maintain two parallel trees --- so,
3792   it's easier to duplicate.  */
3793
3794static bin_tree_t *
3795duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3796{
3797  const bin_tree_t *node;
3798  bin_tree_t *dup_root;
3799  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3800
3801  for (node = root; ; )
3802    {
3803      /* Create a new tree and link it back to the current parent.  */
3804      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3805      if (*p_new == NULL)
3806	return NULL;
3807      (*p_new)->parent = dup_node;
3808      (*p_new)->token.duplicated = 1;
3809      dup_node = *p_new;
3810
3811      /* Go to the left node, or up and to the right.  */
3812      if (node->left)
3813	{
3814	  node = node->left;
3815	  p_new = &dup_node->left;
3816	}
3817      else
3818	{
3819	  const bin_tree_t *prev = NULL;
3820	  while (node->right == prev || node->right == NULL)
3821	    {
3822	      prev = node;
3823	      node = node->parent;
3824	      dup_node = dup_node->parent;
3825	      if (!node)
3826	        return dup_root;
3827	    }
3828	  node = node->right;
3829	  p_new = &dup_node->right;
3830	}
3831    }
3832}
3833