1/* Extended regular expression matching and search library.
2   Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License along
17   with this program; if not, write to the Free Software Foundation,
18   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21					  Idx length, reg_syntax_t syntax);
22static void re_compile_fastmap_iter (regex_t *bufp,
23				     const re_dfastate_t *init_state,
24				     char *fastmap);
25static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
26#ifdef RE_ENABLE_I18N
27static void free_charset (re_charset_t *cset);
28#endif /* RE_ENABLE_I18N */
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36			       reg_errcode_t (fn (void *, bin_tree_t *)),
37			       void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39				reg_errcode_t (fn (void *, bin_tree_t *)),
40				void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44				 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50				   unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53					 Idx node, bool root);
54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55static Idx fetch_number (re_string_t *input, re_token_t *token,
56			 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
58			reg_syntax_t syntax);
59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60			  reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62				  re_token_t *token, reg_syntax_t syntax,
63				  Idx nest, reg_errcode_t *err);
64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65				 re_token_t *token, reg_syntax_t syntax,
66				 Idx nest, reg_errcode_t *err);
67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68				     re_token_t *token, reg_syntax_t syntax,
69				     Idx nest, reg_errcode_t *err);
70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71				  re_token_t *token, reg_syntax_t syntax,
72				  Idx nest, reg_errcode_t *err);
73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74				 re_dfa_t *dfa, re_token_t *token,
75				 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77				      re_token_t *token, reg_syntax_t syntax,
78				      reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80					    re_string_t *regexp,
81					    re_token_t *token, int token_len,
82					    re_dfa_t *dfa,
83					    reg_syntax_t syntax,
84					    bool accept_hyphen);
85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86					  re_string_t *regexp,
87					  re_token_t *token);
88#ifdef RE_ENABLE_I18N
89static reg_errcode_t build_equiv_class (bitset sbcset,
90					re_charset_t *mbcset,
91					Idx *equiv_class_alloc,
92					const unsigned char *name);
93static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94				      bitset sbcset,
95				      re_charset_t *mbcset,
96				      Idx *char_class_alloc,
97				      const unsigned char *class_name,
98				      reg_syntax_t syntax);
99#else  /* not RE_ENABLE_I18N */
100static reg_errcode_t build_equiv_class (bitset sbcset,
101					const unsigned char *name);
102static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103				      bitset sbcset,
104				      const unsigned char *class_name,
105				      reg_syntax_t syntax);
106#endif /* not RE_ENABLE_I18N */
107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108				       unsigned REG_TRANSLATE_TYPE trans,
109				       const unsigned char *class_name,
110				       const unsigned char *extra,
111				       bool non_match, reg_errcode_t *err);
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113				bin_tree_t *left, bin_tree_t *right,
114				re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116				      bin_tree_t *left, bin_tree_t *right,
117				      const re_token_t *token);
118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122
123/* This table gives an error message for each of the error codes listed
124   in regex.h.  Obviously the order here has to be same as there.
125   POSIX doesn't require that we do anything for REG_NOERROR,
126   but why not be nice?  */
127
128const char __re_error_msgid[] attribute_hidden =
129  {
130#define REG_NOERROR_IDX	0
131    gettext_noop ("Success")	/* REG_NOERROR */
132    "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134    gettext_noop ("No match")	/* REG_NOMATCH */
135    "\0"
136#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138    "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141    "\0"
142#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144    "\0"
145#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147    "\0"
148#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150    "\0"
151#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153    "\0"
154#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156    "\0"
157#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159    "\0"
160#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162    "\0"
163#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164    gettext_noop ("Invalid range end")	/* REG_ERANGE */
165    "\0"
166#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167    gettext_noop ("Memory exhausted") /* REG_ESPACE */
168    "\0"
169#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171    "\0"
172#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173    gettext_noop ("Premature end of regular expression") /* REG_EEND */
174    "\0"
175#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176    gettext_noop ("Regular expression too big") /* REG_ESIZE */
177    "\0"
178#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180  };
181
182const size_t __re_error_msgid_idx[] attribute_hidden =
183  {
184    REG_NOERROR_IDX,
185    REG_NOMATCH_IDX,
186    REG_BADPAT_IDX,
187    REG_ECOLLATE_IDX,
188    REG_ECTYPE_IDX,
189    REG_EESCAPE_IDX,
190    REG_ESUBREG_IDX,
191    REG_EBRACK_IDX,
192    REG_EPAREN_IDX,
193    REG_EBRACE_IDX,
194    REG_BADBR_IDX,
195    REG_ERANGE_IDX,
196    REG_ESPACE_IDX,
197    REG_BADRPT_IDX,
198    REG_EEND_IDX,
199    REG_ESIZE_IDX,
200    REG_ERPAREN_IDX
201  };
202
203/* Entry points for GNU code.  */
204
205/* re_compile_pattern is the GNU regular expression compiler: it
206   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207   Returns 0 if the pattern was valid, otherwise an error string.
208
209   Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210   are set in BUFP on entry.  */
211
212const char *
213re_compile_pattern (const char *pattern, size_t length,
214		    struct re_pattern_buffer *bufp)
215{
216  reg_errcode_t ret;
217
218  /* And GNU code determines whether or not to get register information
219     by passing null for the REGS argument to re_match, etc., not by
220     setting re_no_sub, unless REG_NO_SUB is set.  */
221  bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
222
223  /* Match anchors at newline.  */
224  bufp->re_newline_anchor = 1;
225
226  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
227
228  if (!ret)
229    return NULL;
230  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
231}
232#ifdef _LIBC
233weak_alias (__re_compile_pattern, re_compile_pattern)
234#endif
235
236/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
237   also be assigned to arbitrarily: each pattern buffer stores its own
238   syntax, so it can be changed between regex compilations.  */
239/* This has no initializer because initialized variables in Emacs
240   become read-only after dumping.  */
241reg_syntax_t re_syntax_options;
242
243
244/* Specify the precise syntax of regexps for compilation.  This provides
245   for compatibility for various utilities which historically have
246   different, incompatible syntaxes.
247
248   The argument SYNTAX is a bit mask comprised of the various bits
249   defined in regex.h.  We return the old syntax.  */
250
251reg_syntax_t
252re_set_syntax (reg_syntax_t syntax)
253{
254  reg_syntax_t ret = re_syntax_options;
255
256  re_syntax_options = syntax;
257  return ret;
258}
259#ifdef _LIBC
260weak_alias (__re_set_syntax, re_set_syntax)
261#endif
262
263int
264re_compile_fastmap (struct re_pattern_buffer *bufp)
265{
266  re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267  char *fastmap = bufp->re_fastmap;
268
269  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271  if (dfa->init_state != dfa->init_state_word)
272    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273  if (dfa->init_state != dfa->init_state_nl)
274    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275  if (dfa->init_state != dfa->init_state_begbuf)
276    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277  bufp->re_fastmap_accurate = 1;
278  return 0;
279}
280#ifdef _LIBC
281weak_alias (__re_compile_fastmap, re_compile_fastmap)
282#endif
283
284static inline void
285__attribute ((always_inline))
286re_set_fastmap (char *fastmap, bool icase, int ch)
287{
288  fastmap[ch] = 1;
289  if (icase)
290    fastmap[tolower (ch)] = 1;
291}
292
293/* Helper function for re_compile_fastmap.
294   Compile fastmap for the initial_state INIT_STATE.  */
295
296static void
297re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
298			 char *fastmap)
299{
300  re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
301  Idx node_cnt;
302  bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
304    {
305      Idx node = init_state->nodes.elems[node_cnt];
306      re_token_type_t type = dfa->nodes[node].type;
307
308      if (type == CHARACTER)
309	{
310	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311#ifdef RE_ENABLE_I18N
312	  if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
313	    {
314	      unsigned char buf[MB_LEN_MAX];
315	      unsigned char *p;
316	      wchar_t wc;
317	      mbstate_t state;
318
319	      p = buf;
320	      *p++ = dfa->nodes[node].opr.c;
321	      while (++node < dfa->nodes_len
322		     &&	dfa->nodes[node].type == CHARACTER
323		     && dfa->nodes[node].mb_partial)
324		*p++ = dfa->nodes[node].opr.c;
325	      memset (&state, 0, sizeof (state));
326	      if (mbrtowc (&wc, (const char *) buf, p - buf,
327			   &state) == p - buf
328		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
329		      != (size_t) -1))
330		re_set_fastmap (fastmap, false, buf[0]);
331	    }
332#endif
333	}
334      else if (type == SIMPLE_BRACKET)
335	{
336	  int i, j, ch;
337	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
339	      if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j))
340		re_set_fastmap (fastmap, icase, ch);
341	}
342#ifdef RE_ENABLE_I18N
343      else if (type == COMPLEX_BRACKET)
344	{
345	  Idx i;
346	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
347	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
348	      || cset->nranges || cset->nchar_classes)
349	    {
350# ifdef _LIBC
351	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
352		{
353		  /* In this case we want to catch the bytes which are
354		     the first byte of any collation elements.
355		     e.g. In da_DK, we want to catch 'a' since "aa"
356			  is a valid collation element, and don't catch
357			  'b' since 'b' is the only collation element
358			  which starts from 'b'.  */
359		  const int32_t *table = (const int32_t *)
360		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
361		  for (i = 0; i < SBC_MAX; ++i)
362		    if (table[i] < 0)
363		      re_set_fastmap (fastmap, icase, i);
364		}
365# else
366	      if (dfa->mb_cur_max > 1)
367		for (i = 0; i < SBC_MAX; ++i)
368		  if (__btowc (i) == WEOF)
369		    re_set_fastmap (fastmap, icase, i);
370# endif /* not _LIBC */
371	    }
372	  for (i = 0; i < cset->nmbchars; ++i)
373	    {
374	      char buf[256];
375	      mbstate_t state;
376	      memset (&state, '\0', sizeof (state));
377	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
378		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
379	      if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
380		{
381		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
382		      != (size_t) -1)
383		    re_set_fastmap (fastmap, false, *(unsigned char *) buf);
384		}
385	    }
386	}
387#endif /* RE_ENABLE_I18N */
388      else if (type == OP_PERIOD
389#ifdef RE_ENABLE_I18N
390	       || type == OP_UTF8_PERIOD
391#endif /* RE_ENABLE_I18N */
392	       || type == END_OF_RE)
393	{
394	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
395	  if (type == END_OF_RE)
396	    bufp->re_can_be_null = 1;
397	  return;
398	}
399    }
400}
401
402/* Entry point for POSIX code.  */
403/* regcomp takes a regular expression as a string and compiles it.
404
405   PREG is a regex_t *.  We do not expect any fields to be initialized,
406   since POSIX says we shouldn't.  Thus, we set
407
408     `re_buffer' to the compiled pattern;
409     `re_used' to the length of the compiled pattern;
410     `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
411       REG_EXTENDED bit in CFLAGS is set; otherwise, to
412       REG_SYNTAX_POSIX_BASIC;
413     `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
414     `re_fastmap' to an allocated space for the fastmap;
415     `re_fastmap_accurate' to zero;
416     `re_nsub' to the number of subexpressions in PATTERN.
417
418   PATTERN is the address of the pattern string.
419
420   CFLAGS is a series of bits which affect compilation.
421
422     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
423     use POSIX basic syntax.
424
425     If REG_NEWLINE is set, then . and [^...] don't match newline.
426     Also, regexec will try a match beginning after every newline.
427
428     If REG_ICASE is set, then we considers upper- and lowercase
429     versions of letters to be equivalent when matching.
430
431     If REG_NOSUB is set, then when PREG is passed to regexec, that
432     routine will report only success or failure, and nothing about the
433     registers.
434
435   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
436   the return codes and their meanings.)  */
437
438int
439regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
440{
441  reg_errcode_t ret;
442  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
443			: REG_SYNTAX_POSIX_BASIC);
444
445  preg->re_buffer = NULL;
446  preg->re_allocated = 0;
447  preg->re_used = 0;
448
449  /* Try to allocate space for the fastmap.  */
450  preg->re_fastmap = re_malloc (char, SBC_MAX);
451  if (BE (preg->re_fastmap == NULL, 0))
452    return REG_ESPACE;
453
454  syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
455
456  /* If REG_NEWLINE is set, newlines are treated differently.  */
457  if (cflags & REG_NEWLINE)
458    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
459      syntax &= ~REG_DOT_NEWLINE;
460      syntax |= REG_HAT_LISTS_NOT_NEWLINE;
461      /* It also changes the matching behavior.  */
462      preg->re_newline_anchor = 1;
463    }
464  else
465    preg->re_newline_anchor = 0;
466  preg->re_no_sub = !!(cflags & REG_NOSUB);
467  preg->re_translate = NULL;
468
469  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
470
471  /* POSIX doesn't distinguish between an unmatched open-group and an
472     unmatched close-group: both are REG_EPAREN.  */
473  if (ret == REG_ERPAREN)
474    ret = REG_EPAREN;
475
476  /* We have already checked preg->re_fastmap != NULL.  */
477  if (BE (ret == REG_NOERROR, 1))
478    /* Compute the fastmap now, since regexec cannot modify the pattern
479       buffer.  This function never fails in this implementation.  */
480    (void) re_compile_fastmap (preg);
481  else
482    {
483      /* Some error occurred while compiling the expression.  */
484      re_free (preg->re_fastmap);
485      preg->re_fastmap = NULL;
486    }
487
488  return (int) ret;
489}
490#ifdef _LIBC
491weak_alias (__regcomp, regcomp)
492#endif
493
494/* Returns a message corresponding to an error code, ERRCODE, returned
495   from either regcomp or regexec.   We don't use PREG here.  */
496
497size_t
498regerror (int errcode, const regex_t *__restrict preg,
499	  char *__restrict errbuf, size_t errbuf_size)
500{
501  const char *msg;
502  size_t msg_size;
503
504  if (BE (errcode < 0
505	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
506			       / sizeof (__re_error_msgid_idx[0])), 0))
507    /* Only error codes returned by the rest of the code should be passed
508       to this routine.  If we are given anything else, or if other regex
509       code generates an invalid error code, then the program has a bug.
510       Dump core so we can fix it.  */
511    abort ();
512
513  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
514
515  msg_size = strlen (msg) + 1; /* Includes the null.  */
516
517  if (BE (errbuf_size != 0, 1))
518    {
519      if (BE (msg_size > errbuf_size, 0))
520	{
521#if defined HAVE_MEMPCPY || defined _LIBC
522	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
523#else
524	  memcpy (errbuf, msg, errbuf_size - 1);
525	  errbuf[errbuf_size - 1] = 0;
526#endif
527	}
528      else
529	memcpy (errbuf, msg, msg_size);
530    }
531
532  return msg_size;
533}
534#ifdef _LIBC
535weak_alias (__regerror, regerror)
536#endif
537
538
539#ifdef RE_ENABLE_I18N
540/* This static array is used for the map to single-byte characters when
541   UTF-8 is used.  Otherwise we would allocate memory just to initialize
542   it the same all the time.  UTF-8 is the preferred encoding so this is
543   a worthwhile optimization.  */
544static const bitset utf8_sb_map =
545{
546  /* Set the first 128 bits.  */
547# if 2 < BITSET_WORDS
548  BITSET_WORD_MAX,
549# endif
550# if 4 < BITSET_WORDS
551  BITSET_WORD_MAX,
552# endif
553# if 6 < BITSET_WORDS
554  BITSET_WORD_MAX,
555# endif
556# if 8 < BITSET_WORDS
557#  error "Invalid BITSET_WORDS"
558# endif
559  (BITSET_WORD_MAX
560   >> (SBC_MAX % BITSET_WORD_BITS == 0
561       ? 0
562       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
563};
564#endif
565
566
567static void
568free_dfa_content (re_dfa_t *dfa)
569{
570  Idx i, j;
571
572  if (dfa->nodes)
573    for (i = 0; i < dfa->nodes_len; ++i)
574      free_token (dfa->nodes + i);
575  re_free (dfa->nexts);
576  for (i = 0; i < dfa->nodes_len; ++i)
577    {
578      if (dfa->eclosures != NULL)
579	re_node_set_free (dfa->eclosures + i);
580      if (dfa->inveclosures != NULL)
581	re_node_set_free (dfa->inveclosures + i);
582      if (dfa->edests != NULL)
583	re_node_set_free (dfa->edests + i);
584    }
585  re_free (dfa->edests);
586  re_free (dfa->eclosures);
587  re_free (dfa->inveclosures);
588  re_free (dfa->nodes);
589
590  if (dfa->state_table)
591    for (i = 0; i <= dfa->state_hash_mask; ++i)
592      {
593	struct re_state_table_entry *entry = dfa->state_table + i;
594	for (j = 0; j < entry->num; ++j)
595	  {
596	    re_dfastate_t *state = entry->array[j];
597	    free_state (state);
598	  }
599        re_free (entry->array);
600      }
601  re_free (dfa->state_table);
602#ifdef RE_ENABLE_I18N
603  if (dfa->sb_char != utf8_sb_map)
604    re_free (dfa->sb_char);
605#endif
606  re_free (dfa->subexp_map);
607#ifdef DEBUG
608  re_free (dfa->re_str);
609#endif
610
611  re_free (dfa);
612}
613
614
615/* Free dynamically allocated space used by PREG.  */
616
617void
618regfree (regex_t *preg)
619{
620  re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
621  if (BE (dfa != NULL, 1))
622    free_dfa_content (dfa);
623  preg->re_buffer = NULL;
624  preg->re_allocated = 0;
625
626  re_free (preg->re_fastmap);
627  preg->re_fastmap = NULL;
628
629  re_free (preg->re_translate);
630  preg->re_translate = NULL;
631}
632#ifdef _LIBC
633weak_alias (__regfree, regfree)
634#endif
635
636/* Entry points compatible with 4.2 BSD regex library.  We don't define
637   them unless specifically requested.  */
638
639#if defined _REGEX_RE_COMP || defined _LIBC
640
641/* BSD has one and only one pattern buffer.  */
642static struct re_pattern_buffer re_comp_buf;
643
644char *
645# ifdef _LIBC
646/* Make these definitions weak in libc, so POSIX programs can redefine
647   these names if they don't use our functions, and still use
648   regcomp/regexec above without link errors.  */
649weak_function
650# endif
651re_comp (const char *s)
652{
653  reg_errcode_t ret;
654  char *fastmap;
655
656  if (!s)
657    {
658      if (!re_comp_buf.re_buffer)
659	return gettext ("No previous regular expression");
660      return 0;
661    }
662
663  if (re_comp_buf.re_buffer)
664    {
665      fastmap = re_comp_buf.re_fastmap;
666      re_comp_buf.re_fastmap = NULL;
667      __regfree (&re_comp_buf);
668      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
669      re_comp_buf.re_fastmap = fastmap;
670    }
671
672  if (re_comp_buf.re_fastmap == NULL)
673    {
674      re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
675      if (re_comp_buf.re_fastmap == NULL)
676	return (char *) gettext (__re_error_msgid
677				 + __re_error_msgid_idx[(int) REG_ESPACE]);
678    }
679
680  /* Since `re_exec' always passes NULL for the `regs' argument, we
681     don't need to initialize the pattern buffer fields which affect it.  */
682
683  /* Match anchors at newlines.  */
684  re_comp_buf.re_newline_anchor = 1;
685
686  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
687
688  if (!ret)
689    return NULL;
690
691  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
692  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
693}
694
695#ifdef _LIBC
696libc_freeres_fn (free_mem)
697{
698  __regfree (&re_comp_buf);
699}
700#endif
701
702#endif /* _REGEX_RE_COMP */
703
704/* Internal entry point.
705   Compile the regular expression PATTERN, whose length is LENGTH.
706   SYNTAX indicate regular expression's syntax.  */
707
708static reg_errcode_t
709re_compile_internal (regex_t *preg, const char *pattern, Idx length,
710		     reg_syntax_t syntax)
711{
712  reg_errcode_t err = REG_NOERROR;
713  re_dfa_t *dfa;
714  re_string_t regexp;
715
716  /* Initialize the pattern buffer.  */
717  preg->re_fastmap_accurate = 0;
718  preg->re_syntax = syntax;
719  preg->re_not_bol = preg->re_not_eol = 0;
720  preg->re_used = 0;
721  preg->re_nsub = 0;
722  preg->re_can_be_null = 0;
723  preg->re_regs_allocated = REG_UNALLOCATED;
724
725  /* Initialize the dfa.  */
726  dfa = (re_dfa_t *) preg->re_buffer;
727  if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
728    {
729      /* If zero allocated, but buffer is non-null, try to realloc
730	 enough space.  This loses if buffer's address is bogus, but
731	 that is the user's responsibility.  If buffer is null this
732	 is a simple allocation.  */
733      dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
734      if (dfa == NULL)
735	return REG_ESPACE;
736      preg->re_allocated = sizeof (re_dfa_t);
737      preg->re_buffer = (unsigned char *) dfa;
738    }
739  preg->re_used = sizeof (re_dfa_t);
740
741  __libc_lock_init (dfa->lock);
742
743  err = init_dfa (dfa, length);
744  if (BE (err != REG_NOERROR, 0))
745    {
746      free_dfa_content (dfa);
747      preg->re_buffer = NULL;
748      preg->re_allocated = 0;
749      return err;
750    }
751#ifdef DEBUG
752  dfa->re_str = re_malloc (char, length + 1);
753  strncpy (dfa->re_str, pattern, length + 1);
754#endif
755
756  err = re_string_construct (&regexp, pattern, length, preg->re_translate,
757			     syntax & REG_IGNORE_CASE, dfa);
758  if (BE (err != REG_NOERROR, 0))
759    {
760    re_compile_internal_free_return:
761      free_workarea_compile (preg);
762      re_string_destruct (&regexp);
763      free_dfa_content (dfa);
764      preg->re_buffer = NULL;
765      preg->re_allocated = 0;
766      return err;
767    }
768
769  /* Parse the regular expression, and build a structure tree.  */
770  preg->re_nsub = 0;
771  dfa->str_tree = parse (&regexp, preg, syntax, &err);
772  if (BE (dfa->str_tree == NULL, 0))
773    goto re_compile_internal_free_return;
774
775  /* Analyze the tree and create the nfa.  */
776  err = analyze (preg);
777  if (BE (err != REG_NOERROR, 0))
778    goto re_compile_internal_free_return;
779
780#ifdef RE_ENABLE_I18N
781  /* If possible, do searching in single byte encoding to speed things up.  */
782  if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
783    optimize_utf8 (dfa);
784#endif
785
786  /* Then create the initial state of the dfa.  */
787  err = create_initial_state (dfa);
788
789  /* Release work areas.  */
790  free_workarea_compile (preg);
791  re_string_destruct (&regexp);
792
793  if (BE (err != REG_NOERROR, 0))
794    {
795      free_dfa_content (dfa);
796      preg->re_buffer = NULL;
797      preg->re_allocated = 0;
798    }
799
800  return err;
801}
802
803/* Initialize DFA.  We use the length of the regular expression PAT_LEN
804   as the initial length of some arrays.  */
805
806static reg_errcode_t
807init_dfa (re_dfa_t *dfa, Idx pat_len)
808{
809  __re_size_t table_size;
810#ifndef _LIBC
811  char *codeset_name;
812#endif
813
814  memset (dfa, '\0', sizeof (re_dfa_t));
815
816  /* Force allocation of str_tree_storage the first time.  */
817  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
818
819  dfa->nodes_alloc = pat_len + 1;
820  dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc);
821
822  /*  table_size = 2 ^ ceil(log pat_len) */
823  for (table_size = 1; table_size <= pat_len; table_size <<= 1)
824    if (0 < (Idx) -1 && table_size == 0)
825      return REG_ESPACE;
826
827  dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
828  dfa->state_hash_mask = table_size - 1;
829
830  dfa->mb_cur_max = MB_CUR_MAX;
831#ifdef _LIBC
832  if (dfa->mb_cur_max == 6
833      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
834    dfa->is_utf8 = 1;
835  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
836		       != 0);
837#else
838# ifdef HAVE_LANGINFO_CODESET
839  codeset_name = nl_langinfo (CODESET);
840# else
841  codeset_name = getenv ("LC_ALL");
842  if (codeset_name == NULL || codeset_name[0] == '\0')
843    codeset_name = getenv ("LC_CTYPE");
844  if (codeset_name == NULL || codeset_name[0] == '\0')
845    codeset_name = getenv ("LANG");
846  if (codeset_name == NULL)
847    codeset_name = "";
848  else if (strchr (codeset_name, '.') !=  NULL)
849    codeset_name = strchr (codeset_name, '.') + 1;
850# endif
851
852  if (strcasecmp (codeset_name, "UTF-8") == 0
853      || strcasecmp (codeset_name, "UTF8") == 0)
854    dfa->is_utf8 = 1;
855
856  /* We check exhaustively in the loop below if this charset is a
857     superset of ASCII.  */
858  dfa->map_notascii = 0;
859#endif
860
861#ifdef RE_ENABLE_I18N
862  if (dfa->mb_cur_max > 1)
863    {
864      if (dfa->is_utf8)
865	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
866      else
867	{
868	  int i, j, ch;
869
870	  dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS);
871	  if (BE (dfa->sb_char == NULL, 0))
872	    return REG_ESPACE;
873
874	  /* Set the bits corresponding to single byte chars.  */
875	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
876	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
877	      {
878		wint_t wch = __btowc (ch);
879		if (wch != WEOF)
880		  dfa->sb_char[i] |= (bitset_word) 1 << j;
881# ifndef _LIBC
882		if (isascii (ch) && wch != ch)
883		  dfa->map_notascii = 1;
884# endif
885	      }
886	}
887    }
888#endif
889
890  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
891    return REG_ESPACE;
892  return REG_NOERROR;
893}
894
895/* Initialize WORD_CHAR table, which indicate which character is
896   "word".  In this case "word" means that it is the word construction
897   character used by some operators like "\<", "\>", etc.  */
898
899static void
900init_word_char (re_dfa_t *dfa)
901{
902  int i, j, ch;
903  dfa->word_ops_used = 1;
904  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
905    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
906      if (isalnum (ch) || ch == '_')
907	dfa->word_char[i] |= (bitset_word) 1 << j;
908}
909
910/* Free the work area which are only used while compiling.  */
911
912static void
913free_workarea_compile (regex_t *preg)
914{
915  re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
916  bin_tree_storage_t *storage, *next;
917  for (storage = dfa->str_tree_storage; storage; storage = next)
918    {
919      next = storage->next;
920      re_free (storage);
921    }
922  dfa->str_tree_storage = NULL;
923  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
924  dfa->str_tree = NULL;
925  re_free (dfa->org_indices);
926  dfa->org_indices = NULL;
927}
928
929/* Create initial states for all contexts.  */
930
931static reg_errcode_t
932create_initial_state (re_dfa_t *dfa)
933{
934  Idx first, i;
935  reg_errcode_t err;
936  re_node_set init_nodes;
937
938  /* Initial states have the epsilon closure of the node which is
939     the first node of the regular expression.  */
940  first = dfa->str_tree->first->node_idx;
941  dfa->init_node = first;
942  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
943  if (BE (err != REG_NOERROR, 0))
944    return err;
945
946  /* The back-references which are in initial states can epsilon transit,
947     since in this case all of the subexpressions can be null.
948     Then we add epsilon closures of the nodes which are the next nodes of
949     the back-references.  */
950  if (dfa->nbackref > 0)
951    for (i = 0; i < init_nodes.nelem; ++i)
952      {
953	Idx node_idx = init_nodes.elems[i];
954	re_token_type_t type = dfa->nodes[node_idx].type;
955
956	Idx clexp_idx;
957	if (type != OP_BACK_REF)
958	  continue;
959	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
960	  {
961	    re_token_t *clexp_node;
962	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
963	    if (clexp_node->type == OP_CLOSE_SUBEXP
964		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
965	      break;
966	  }
967	if (clexp_idx == init_nodes.nelem)
968	  continue;
969
970	if (type == OP_BACK_REF)
971	  {
972	    Idx dest_idx = dfa->edests[node_idx].elems[0];
973	    if (!re_node_set_contains (&init_nodes, dest_idx))
974	      {
975		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
976		i = 0;
977	      }
978	  }
979      }
980
981  /* It must be the first time to invoke acquire_state.  */
982  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
983  /* We don't check ERR here, since the initial state must not be NULL.  */
984  if (BE (dfa->init_state == NULL, 0))
985    return err;
986  if (dfa->init_state->has_constraint)
987    {
988      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
989						       CONTEXT_WORD);
990      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
991						     CONTEXT_NEWLINE);
992      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
993							 &init_nodes,
994							 CONTEXT_NEWLINE
995							 | CONTEXT_BEGBUF);
996      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
997	      || dfa->init_state_begbuf == NULL, 0))
998	return err;
999    }
1000  else
1001    dfa->init_state_word = dfa->init_state_nl
1002      = dfa->init_state_begbuf = dfa->init_state;
1003
1004  re_node_set_free (&init_nodes);
1005  return REG_NOERROR;
1006}
1007
1008#ifdef RE_ENABLE_I18N
1009/* If it is possible to do searching in single byte encoding instead of UTF-8
1010   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1011   DFA nodes where needed.  */
1012
1013static void
1014optimize_utf8 (re_dfa_t *dfa)
1015{
1016  Idx node;
1017  int i;
1018  bool mb_chars = false;
1019  bool has_period = false;
1020
1021  for (node = 0; node < dfa->nodes_len; ++node)
1022    switch (dfa->nodes[node].type)
1023      {
1024      case CHARACTER:
1025	if (dfa->nodes[node].opr.c >= 0x80)
1026	  mb_chars = true;
1027	break;
1028      case ANCHOR:
1029	switch (dfa->nodes[node].opr.idx)
1030	  {
1031	  case LINE_FIRST:
1032	  case LINE_LAST:
1033	  case BUF_FIRST:
1034	  case BUF_LAST:
1035	    break;
1036	  default:
1037	    /* Word anchors etc. cannot be handled.  */
1038	    return;
1039	  }
1040	break;
1041      case OP_PERIOD:
1042        has_period = true;
1043        break;
1044      case OP_BACK_REF:
1045      case OP_ALT:
1046      case END_OF_RE:
1047      case OP_DUP_ASTERISK:
1048      case OP_OPEN_SUBEXP:
1049      case OP_CLOSE_SUBEXP:
1050	break;
1051      case COMPLEX_BRACKET:
1052	return;
1053      case SIMPLE_BRACKET:
1054	/* Just double check.  */
1055	{
1056	  int rshift =
1057	    (SBC_MAX / 2 % BITSET_WORD_BITS == 0
1058	     ? 0
1059	     : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS);
1060	  for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1061	    {
1062	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1063		return;
1064	      rshift = 0;
1065	    }
1066	}
1067	break;
1068      default:
1069	abort ();
1070      }
1071
1072  if (mb_chars || has_period)
1073    for (node = 0; node < dfa->nodes_len; ++node)
1074      {
1075	if (dfa->nodes[node].type == CHARACTER
1076	    && dfa->nodes[node].opr.c >= 0x80)
1077	  dfa->nodes[node].mb_partial = 0;
1078	else if (dfa->nodes[node].type == OP_PERIOD)
1079	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1080      }
1081
1082  /* The search can be in single byte locale.  */
1083  dfa->mb_cur_max = 1;
1084  dfa->is_utf8 = 0;
1085  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1086}
1087#endif
1088
1089/* Analyze the structure tree, and calculate "first", "next", "edest",
1090   "eclosure", and "inveclosure".  */
1091
1092static reg_errcode_t
1093analyze (regex_t *preg)
1094{
1095  re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1096  reg_errcode_t ret;
1097
1098  /* Allocate arrays.  */
1099  dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1100  dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1101  dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc);
1102  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1103  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1104	  || dfa->eclosures == NULL, 0))
1105    return REG_ESPACE;
1106
1107  dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub);
1108  if (dfa->subexp_map != NULL)
1109    {
1110      Idx i;
1111      for (i = 0; i < preg->re_nsub; i++)
1112	dfa->subexp_map[i] = i;
1113      preorder (dfa->str_tree, optimize_subexps, dfa);
1114      for (i = 0; i < preg->re_nsub; i++)
1115	if (dfa->subexp_map[i] != i)
1116	  break;
1117      if (i == preg->re_nsub)
1118	{
1119	  free (dfa->subexp_map);
1120	  dfa->subexp_map = NULL;
1121	}
1122    }
1123
1124  ret = postorder (dfa->str_tree, lower_subexps, preg);
1125  if (BE (ret != REG_NOERROR, 0))
1126    return ret;
1127  ret = postorder (dfa->str_tree, calc_first, dfa);
1128  if (BE (ret != REG_NOERROR, 0))
1129    return ret;
1130  preorder (dfa->str_tree, calc_next, dfa);
1131  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1132  if (BE (ret != REG_NOERROR, 0))
1133    return ret;
1134  ret = calc_eclosure (dfa);
1135  if (BE (ret != REG_NOERROR, 0))
1136    return ret;
1137
1138  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1139     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1140  if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1141      || dfa->nbackref)
1142    {
1143      dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len);
1144      if (BE (dfa->inveclosures == NULL, 0))
1145        return REG_ESPACE;
1146      ret = calc_inveclosure (dfa);
1147    }
1148
1149  return ret;
1150}
1151
1152/* Our parse trees are very unbalanced, so we cannot use a stack to
1153   implement parse tree visits.  Instead, we use parent pointers and
1154   some hairy code in these two functions.  */
1155static reg_errcode_t
1156postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1157	   void *extra)
1158{
1159  bin_tree_t *node, *prev;
1160
1161  for (node = root; ; )
1162    {
1163      /* Descend down the tree, preferably to the left (or to the right
1164	 if that's the only child).  */
1165      while (node->left || node->right)
1166	if (node->left)
1167          node = node->left;
1168        else
1169          node = node->right;
1170
1171      do
1172	{
1173	  reg_errcode_t err = fn (extra, node);
1174	  if (BE (err != REG_NOERROR, 0))
1175	    return err;
1176          if (node->parent == NULL)
1177	    return REG_NOERROR;
1178	  prev = node;
1179	  node = node->parent;
1180	}
1181      /* Go up while we have a node that is reached from the right.  */
1182      while (node->right == prev || node->right == NULL);
1183      node = node->right;
1184    }
1185}
1186
1187static reg_errcode_t
1188preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189	  void *extra)
1190{
1191  bin_tree_t *node;
1192
1193  for (node = root; ; )
1194    {
1195      reg_errcode_t err = fn (extra, node);
1196      if (BE (err != REG_NOERROR, 0))
1197	return err;
1198
1199      /* Go to the left node, or up and to the right.  */
1200      if (node->left)
1201	node = node->left;
1202      else
1203	{
1204	  bin_tree_t *prev = NULL;
1205	  while (node->right == prev || node->right == NULL)
1206	    {
1207	      prev = node;
1208	      node = node->parent;
1209	      if (!node)
1210	        return REG_NOERROR;
1211	    }
1212	  node = node->right;
1213	}
1214    }
1215}
1216
1217/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1218   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1219   backreferences as well.  Requires a preorder visit.  */
1220static reg_errcode_t
1221optimize_subexps (void *extra, bin_tree_t *node)
1222{
1223  re_dfa_t *dfa = (re_dfa_t *) extra;
1224
1225  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1226    {
1227      int idx = node->token.opr.idx;
1228      node->token.opr.idx = dfa->subexp_map[idx];
1229      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1230    }
1231
1232  else if (node->token.type == SUBEXP
1233           && node->left && node->left->token.type == SUBEXP)
1234    {
1235      Idx other_idx = node->left->token.opr.idx;
1236
1237      node->left = node->left->left;
1238      if (node->left)
1239        node->left->parent = node;
1240
1241      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1242      if (other_idx < BITSET_WORD_BITS)
1243	dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx);
1244    }
1245
1246  return REG_NOERROR;
1247}
1248
1249/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1250   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1251static reg_errcode_t
1252lower_subexps (void *extra, bin_tree_t *node)
1253{
1254  regex_t *preg = (regex_t *) extra;
1255  reg_errcode_t err = REG_NOERROR;
1256
1257  if (node->left && node->left->token.type == SUBEXP)
1258    {
1259      node->left = lower_subexp (&err, preg, node->left);
1260      if (node->left)
1261	node->left->parent = node;
1262    }
1263  if (node->right && node->right->token.type == SUBEXP)
1264    {
1265      node->right = lower_subexp (&err, preg, node->right);
1266      if (node->right)
1267	node->right->parent = node;
1268    }
1269
1270  return err;
1271}
1272
1273static bin_tree_t *
1274lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1275{
1276  re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1277  bin_tree_t *body = node->left;
1278  bin_tree_t *op, *cls, *tree1, *tree;
1279
1280  if (preg->re_no_sub
1281      /* We do not optimize empty subexpressions, because otherwise we may
1282	 have bad CONCAT nodes with NULL children.  This is obviously not
1283	 very common, so we do not lose much.  An example that triggers
1284	 this case is the sed "script" /\(\)/x.  */
1285      && node->left != NULL
1286      && ! (node->token.opr.idx < BITSET_WORD_BITS
1287	    && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx)))
1288    return node->left;
1289
1290  /* Convert the SUBEXP node to the concatenation of an
1291     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1292  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1293  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1294  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1295  tree = create_tree (dfa, op, tree1, CONCAT);
1296  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1297    {
1298      *err = REG_ESPACE;
1299      return NULL;
1300    }
1301
1302  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1303  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1304  return tree;
1305}
1306
1307/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1308   nodes.  Requires a postorder visit.  */
1309static reg_errcode_t
1310calc_first (void *extra, bin_tree_t *node)
1311{
1312  re_dfa_t *dfa = (re_dfa_t *) extra;
1313  if (node->token.type == CONCAT)
1314    {
1315      node->first = node->left->first;
1316      node->node_idx = node->left->node_idx;
1317    }
1318  else
1319    {
1320      node->first = node;
1321      node->node_idx = re_dfa_add_node (dfa, node->token);
1322      if (BE (node->node_idx == REG_MISSING, 0))
1323        return REG_ESPACE;
1324    }
1325  return REG_NOERROR;
1326}
1327
1328/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1329static reg_errcode_t
1330calc_next (void *extra, bin_tree_t *node)
1331{
1332  switch (node->token.type)
1333    {
1334    case OP_DUP_ASTERISK:
1335      node->left->next = node;
1336      break;
1337    case CONCAT:
1338      node->left->next = node->right->first;
1339      node->right->next = node->next;
1340      break;
1341    default:
1342      if (node->left)
1343	node->left->next = node->next;
1344      if (node->right)
1345        node->right->next = node->next;
1346      break;
1347    }
1348  return REG_NOERROR;
1349}
1350
1351/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1352static reg_errcode_t
1353link_nfa_nodes (void *extra, bin_tree_t *node)
1354{
1355  re_dfa_t *dfa = (re_dfa_t *) extra;
1356  Idx idx = node->node_idx;
1357  reg_errcode_t err = REG_NOERROR;
1358
1359  switch (node->token.type)
1360    {
1361    case CONCAT:
1362      break;
1363
1364    case END_OF_RE:
1365      assert (node->next == NULL);
1366      break;
1367
1368    case OP_DUP_ASTERISK:
1369    case OP_ALT:
1370      {
1371	Idx left, right;
1372	dfa->has_plural_match = 1;
1373	if (node->left != NULL)
1374	  left = node->left->first->node_idx;
1375	else
1376	  left = node->next->node_idx;
1377	if (node->right != NULL)
1378	  right = node->right->first->node_idx;
1379	else
1380	  right = node->next->node_idx;
1381	assert (REG_VALID_INDEX (left));
1382	assert (REG_VALID_INDEX (right));
1383	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1384      }
1385      break;
1386
1387    case ANCHOR:
1388    case OP_OPEN_SUBEXP:
1389    case OP_CLOSE_SUBEXP:
1390      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1391      break;
1392
1393    case OP_BACK_REF:
1394      dfa->nexts[idx] = node->next->node_idx;
1395      if (node->token.type == OP_BACK_REF)
1396	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1397      break;
1398
1399    default:
1400      assert (!IS_EPSILON_NODE (node->token.type));
1401      dfa->nexts[idx] = node->next->node_idx;
1402      break;
1403    }
1404
1405  return err;
1406}
1407
1408/* Duplicate the epsilon closure of the node ROOT_NODE.
1409   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1410   to their own constraint.  */
1411
1412static reg_errcode_t
1413duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1414			Idx top_clone_node, Idx root_node,
1415			unsigned int init_constraint)
1416{
1417  Idx org_node, clone_node;
1418  bool ok;
1419  unsigned int constraint = init_constraint;
1420  for (org_node = top_org_node, clone_node = top_clone_node;;)
1421    {
1422      Idx org_dest, clone_dest;
1423      if (dfa->nodes[org_node].type == OP_BACK_REF)
1424	{
1425	  /* If the back reference epsilon-transit, its destination must
1426	     also have the constraint.  Then duplicate the epsilon closure
1427	     of the destination of the back reference, and store it in
1428	     edests of the back reference.  */
1429	  org_dest = dfa->nexts[org_node];
1430	  re_node_set_empty (dfa->edests + clone_node);
1431	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1432	  if (BE (clone_dest == REG_MISSING, 0))
1433	    return REG_ESPACE;
1434	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1435	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1436	  if (BE (! ok, 0))
1437	    return REG_ESPACE;
1438	}
1439      else if (dfa->edests[org_node].nelem == 0)
1440	{
1441	  /* In case of the node can't epsilon-transit, don't duplicate the
1442	     destination and store the original destination as the
1443	     destination of the node.  */
1444	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1445	  break;
1446	}
1447      else if (dfa->edests[org_node].nelem == 1)
1448	{
1449	  /* In case of the node can epsilon-transit, and it has only one
1450	     destination.  */
1451	  org_dest = dfa->edests[org_node].elems[0];
1452	  re_node_set_empty (dfa->edests + clone_node);
1453	  if (dfa->nodes[org_node].type == ANCHOR)
1454	    {
1455	      /* In case of the node has another constraint, append it.  */
1456	      if (org_node == root_node && clone_node != org_node)
1457		{
1458		  /* ...but if the node is root_node itself, it means the
1459		     epsilon closure have a loop, then tie it to the
1460		     destination of the root_node.  */
1461		  ok = re_node_set_insert (dfa->edests + clone_node,
1462					    org_dest);
1463		  if (BE (! ok, 0))
1464		    return REG_ESPACE;
1465		  break;
1466		}
1467	      constraint |= dfa->nodes[org_node].opr.ctx_type;
1468	    }
1469	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1470	  if (BE (clone_dest == REG_MISSING, 0))
1471	    return REG_ESPACE;
1472	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1473	  if (BE (! ok, 0))
1474	    return REG_ESPACE;
1475	}
1476      else /* dfa->edests[org_node].nelem == 2 */
1477	{
1478	  /* In case of the node can epsilon-transit, and it has two
1479	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1480	  org_dest = dfa->edests[org_node].elems[0];
1481	  re_node_set_empty (dfa->edests + clone_node);
1482	  /* Search for a duplicated node which satisfies the constraint.  */
1483	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1484	  if (clone_dest == REG_MISSING)
1485	    {
1486	      /* There are no such a duplicated node, create a new one.  */
1487	      reg_errcode_t err;
1488	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1489	      if (BE (clone_dest == REG_MISSING, 0))
1490		return REG_ESPACE;
1491	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1492	      if (BE (! ok, 0))
1493		return REG_ESPACE;
1494	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1495					    root_node, constraint);
1496	      if (BE (err != REG_NOERROR, 0))
1497		return err;
1498	    }
1499	  else
1500	    {
1501	      /* There are a duplicated node which satisfy the constraint,
1502		 use it to avoid infinite loop.  */
1503	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1504	      if (BE (! ok, 0))
1505		return REG_ESPACE;
1506	    }
1507
1508	  org_dest = dfa->edests[org_node].elems[1];
1509	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1510	  if (BE (clone_dest == REG_MISSING, 0))
1511	    return REG_ESPACE;
1512	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1513	  if (BE (! ok, 0))
1514	    return REG_ESPACE;
1515	}
1516      org_node = org_dest;
1517      clone_node = clone_dest;
1518    }
1519  return REG_NOERROR;
1520}
1521
1522/* Search for a node which is duplicated from the node ORG_NODE, and
1523   satisfies the constraint CONSTRAINT.  */
1524
1525static Idx
1526search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1527			unsigned int constraint)
1528{
1529  Idx idx;
1530  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1531    {
1532      if (org_node == dfa->org_indices[idx]
1533	  && constraint == dfa->nodes[idx].constraint)
1534	return idx; /* Found.  */
1535    }
1536  return REG_MISSING; /* Not found.  */
1537}
1538
1539/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1540   Return the index of the new node, or REG_MISSING if insufficient storage is
1541   available.  */
1542
1543static Idx
1544duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1545{
1546  Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1547  if (BE (dup_idx != REG_MISSING, 1))
1548    {
1549      dfa->nodes[dup_idx].constraint = constraint;
1550      if (dfa->nodes[org_idx].type == ANCHOR)
1551	dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1552      dfa->nodes[dup_idx].duplicated = 1;
1553
1554      /* Store the index of the original node.  */
1555      dfa->org_indices[dup_idx] = org_idx;
1556    }
1557  return dup_idx;
1558}
1559
1560static reg_errcode_t
1561calc_inveclosure (re_dfa_t *dfa)
1562{
1563  Idx src, idx;
1564  bool ok;
1565  for (idx = 0; idx < dfa->nodes_len; ++idx)
1566    re_node_set_init_empty (dfa->inveclosures + idx);
1567
1568  for (src = 0; src < dfa->nodes_len; ++src)
1569    {
1570      Idx *elems = dfa->eclosures[src].elems;
1571      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1572	{
1573	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1574	  if (BE (! ok, 0))
1575	    return REG_ESPACE;
1576	}
1577    }
1578
1579  return REG_NOERROR;
1580}
1581
1582/* Calculate "eclosure" for all the node in DFA.  */
1583
1584static reg_errcode_t
1585calc_eclosure (re_dfa_t *dfa)
1586{
1587  Idx node_idx;
1588  bool incomplete;
1589#ifdef DEBUG
1590  assert (dfa->nodes_len > 0);
1591#endif
1592  incomplete = false;
1593  /* For each nodes, calculate epsilon closure.  */
1594  for (node_idx = 0; ; ++node_idx)
1595    {
1596      reg_errcode_t err;
1597      re_node_set eclosure_elem;
1598      if (node_idx == dfa->nodes_len)
1599	{
1600	  if (!incomplete)
1601	    break;
1602	  incomplete = false;
1603	  node_idx = 0;
1604	}
1605
1606#ifdef DEBUG
1607      assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1608#endif
1609
1610      /* If we have already calculated, skip it.  */
1611      if (dfa->eclosures[node_idx].nelem != 0)
1612	continue;
1613      /* Calculate epsilon closure of `node_idx'.  */
1614      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1615      if (BE (err != REG_NOERROR, 0))
1616	return err;
1617
1618      if (dfa->eclosures[node_idx].nelem == 0)
1619	{
1620	  incomplete = true;
1621	  re_node_set_free (&eclosure_elem);
1622	}
1623    }
1624  return REG_NOERROR;
1625}
1626
1627/* Calculate epsilon closure of NODE.  */
1628
1629static reg_errcode_t
1630calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1631{
1632  reg_errcode_t err;
1633  unsigned int constraint;
1634  Idx i;
1635  bool incomplete;
1636  bool ok;
1637  re_node_set eclosure;
1638  incomplete = false;
1639  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1640  if (BE (err != REG_NOERROR, 0))
1641    return err;
1642
1643  /* This indicates that we are calculating this node now.
1644     We reference this value to avoid infinite loop.  */
1645  dfa->eclosures[node].nelem = REG_MISSING;
1646
1647  constraint = ((dfa->nodes[node].type == ANCHOR)
1648		? dfa->nodes[node].opr.ctx_type : 0);
1649  /* If the current node has constraints, duplicate all nodes.
1650     Since they must inherit the constraints.  */
1651  if (constraint
1652      && dfa->edests[node].nelem
1653      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1654    {
1655      Idx org_node, cur_node;
1656      org_node = cur_node = node;
1657      err = duplicate_node_closure (dfa, node, node, node, constraint);
1658      if (BE (err != REG_NOERROR, 0))
1659	return err;
1660    }
1661
1662  /* Expand each epsilon destination nodes.  */
1663  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1664    for (i = 0; i < dfa->edests[node].nelem; ++i)
1665      {
1666	re_node_set eclosure_elem;
1667	Idx edest = dfa->edests[node].elems[i];
1668	/* If calculating the epsilon closure of `edest' is in progress,
1669	   return intermediate result.  */
1670	if (dfa->eclosures[edest].nelem == REG_MISSING)
1671	  {
1672	    incomplete = true;
1673	    continue;
1674	  }
1675	/* If we haven't calculated the epsilon closure of `edest' yet,
1676	   calculate now. Otherwise use calculated epsilon closure.  */
1677	if (dfa->eclosures[edest].nelem == 0)
1678	  {
1679	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1680	    if (BE (err != REG_NOERROR, 0))
1681	      return err;
1682	  }
1683	else
1684	  eclosure_elem = dfa->eclosures[edest];
1685	/* Merge the epsilon closure of `edest'.  */
1686	re_node_set_merge (&eclosure, &eclosure_elem);
1687	/* If the epsilon closure of `edest' is incomplete,
1688	   the epsilon closure of this node is also incomplete.  */
1689	if (dfa->eclosures[edest].nelem == 0)
1690	  {
1691	    incomplete = true;
1692	    re_node_set_free (&eclosure_elem);
1693	  }
1694      }
1695
1696  /* Epsilon closures include itself.  */
1697  ok = re_node_set_insert (&eclosure, node);
1698  if (BE (! ok, 0))
1699    return REG_ESPACE;
1700  if (incomplete && !root)
1701    dfa->eclosures[node].nelem = 0;
1702  else
1703    dfa->eclosures[node] = eclosure;
1704  *new_set = eclosure;
1705  return REG_NOERROR;
1706}
1707
1708/* Functions for token which are used in the parser.  */
1709
1710/* Fetch a token from INPUT.
1711   We must not use this function inside bracket expressions.  */
1712
1713static void
1714fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1715{
1716  re_string_skip_bytes (input, peek_token (result, input, syntax));
1717}
1718
1719/* Peek a token from INPUT, and return the length of the token.
1720   We must not use this function inside bracket expressions.  */
1721
1722static int
1723peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1724{
1725  unsigned char c;
1726
1727  if (re_string_eoi (input))
1728    {
1729      token->type = END_OF_RE;
1730      return 0;
1731    }
1732
1733  c = re_string_peek_byte (input, 0);
1734  token->opr.c = c;
1735
1736  token->word_char = 0;
1737#ifdef RE_ENABLE_I18N
1738  token->mb_partial = 0;
1739  if (input->mb_cur_max > 1 &&
1740      !re_string_first_byte (input, re_string_cur_idx (input)))
1741    {
1742      token->type = CHARACTER;
1743      token->mb_partial = 1;
1744      return 1;
1745    }
1746#endif
1747  if (c == '\\')
1748    {
1749      unsigned char c2;
1750      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1751	{
1752	  token->type = BACK_SLASH;
1753	  return 1;
1754	}
1755
1756      c2 = re_string_peek_byte_case (input, 1);
1757      token->opr.c = c2;
1758      token->type = CHARACTER;
1759#ifdef RE_ENABLE_I18N
1760      if (input->mb_cur_max > 1)
1761	{
1762	  wint_t wc = re_string_wchar_at (input,
1763					  re_string_cur_idx (input) + 1);
1764	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1765	}
1766      else
1767#endif
1768	token->word_char = IS_WORD_CHAR (c2) != 0;
1769
1770      switch (c2)
1771	{
1772	case '|':
1773	  if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1774	    token->type = OP_ALT;
1775	  break;
1776	case '1': case '2': case '3': case '4': case '5':
1777	case '6': case '7': case '8': case '9':
1778	  if (!(syntax & REG_NO_BK_REFS))
1779	    {
1780	      token->type = OP_BACK_REF;
1781	      token->opr.idx = c2 - '1';
1782	    }
1783	  break;
1784	case '<':
1785	  if (!(syntax & REG_NO_GNU_OPS))
1786	    {
1787	      token->type = ANCHOR;
1788	      token->opr.ctx_type = WORD_FIRST;
1789	    }
1790	  break;
1791	case '>':
1792	  if (!(syntax & REG_NO_GNU_OPS))
1793	    {
1794	      token->type = ANCHOR;
1795	      token->opr.ctx_type = WORD_LAST;
1796	    }
1797	  break;
1798	case 'b':
1799	  if (!(syntax & REG_NO_GNU_OPS))
1800	    {
1801	      token->type = ANCHOR;
1802	      token->opr.ctx_type = WORD_DELIM;
1803	    }
1804	  break;
1805	case 'B':
1806	  if (!(syntax & REG_NO_GNU_OPS))
1807	    {
1808	      token->type = ANCHOR;
1809	      token->opr.ctx_type = NOT_WORD_DELIM;
1810	    }
1811	  break;
1812	case 'w':
1813	  if (!(syntax & REG_NO_GNU_OPS))
1814	    token->type = OP_WORD;
1815	  break;
1816	case 'W':
1817	  if (!(syntax & REG_NO_GNU_OPS))
1818	    token->type = OP_NOTWORD;
1819	  break;
1820	case 's':
1821	  if (!(syntax & REG_NO_GNU_OPS))
1822	    token->type = OP_SPACE;
1823	  break;
1824	case 'S':
1825	  if (!(syntax & REG_NO_GNU_OPS))
1826	    token->type = OP_NOTSPACE;
1827	  break;
1828	case '`':
1829	  if (!(syntax & REG_NO_GNU_OPS))
1830	    {
1831	      token->type = ANCHOR;
1832	      token->opr.ctx_type = BUF_FIRST;
1833	    }
1834	  break;
1835	case '\'':
1836	  if (!(syntax & REG_NO_GNU_OPS))
1837	    {
1838	      token->type = ANCHOR;
1839	      token->opr.ctx_type = BUF_LAST;
1840	    }
1841	  break;
1842	case '(':
1843	  if (!(syntax & REG_NO_BK_PARENS))
1844	    token->type = OP_OPEN_SUBEXP;
1845	  break;
1846	case ')':
1847	  if (!(syntax & REG_NO_BK_PARENS))
1848	    token->type = OP_CLOSE_SUBEXP;
1849	  break;
1850	case '+':
1851	  if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1852	    token->type = OP_DUP_PLUS;
1853	  break;
1854	case '?':
1855	  if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1856	    token->type = OP_DUP_QUESTION;
1857	  break;
1858	case '{':
1859	  if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1860	    token->type = OP_OPEN_DUP_NUM;
1861	  break;
1862	case '}':
1863	  if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1864	    token->type = OP_CLOSE_DUP_NUM;
1865	  break;
1866	default:
1867	  break;
1868	}
1869      return 2;
1870    }
1871
1872  token->type = CHARACTER;
1873#ifdef RE_ENABLE_I18N
1874  if (input->mb_cur_max > 1)
1875    {
1876      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1877      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1878    }
1879  else
1880#endif
1881    token->word_char = IS_WORD_CHAR (token->opr.c);
1882
1883  switch (c)
1884    {
1885    case '\n':
1886      if (syntax & REG_NEWLINE_ALT)
1887	token->type = OP_ALT;
1888      break;
1889    case '|':
1890      if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1891	token->type = OP_ALT;
1892      break;
1893    case '*':
1894      token->type = OP_DUP_ASTERISK;
1895      break;
1896    case '+':
1897      if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1898	token->type = OP_DUP_PLUS;
1899      break;
1900    case '?':
1901      if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1902	token->type = OP_DUP_QUESTION;
1903      break;
1904    case '{':
1905      if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1906	token->type = OP_OPEN_DUP_NUM;
1907      break;
1908    case '}':
1909      if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1910	token->type = OP_CLOSE_DUP_NUM;
1911      break;
1912    case '(':
1913      if (syntax & REG_NO_BK_PARENS)
1914	token->type = OP_OPEN_SUBEXP;
1915      break;
1916    case ')':
1917      if (syntax & REG_NO_BK_PARENS)
1918	token->type = OP_CLOSE_SUBEXP;
1919      break;
1920    case '[':
1921      token->type = OP_OPEN_BRACKET;
1922      break;
1923    case '.':
1924      token->type = OP_PERIOD;
1925      break;
1926    case '^':
1927      if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1928	  re_string_cur_idx (input) != 0)
1929	{
1930	  char prev = re_string_peek_byte (input, -1);
1931	  if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1932	    break;
1933	}
1934      token->type = ANCHOR;
1935      token->opr.ctx_type = LINE_FIRST;
1936      break;
1937    case '$':
1938      if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1939	  re_string_cur_idx (input) + 1 != re_string_length (input))
1940	{
1941	  re_token_t next;
1942	  re_string_skip_bytes (input, 1);
1943	  peek_token (&next, input, syntax);
1944	  re_string_skip_bytes (input, -1);
1945	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1946	    break;
1947	}
1948      token->type = ANCHOR;
1949      token->opr.ctx_type = LINE_LAST;
1950      break;
1951    default:
1952      break;
1953    }
1954  return 1;
1955}
1956
1957/* Peek a token from INPUT, and return the length of the token.
1958   We must not use this function out of bracket expressions.  */
1959
1960static int
1961peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1962{
1963  unsigned char c;
1964  if (re_string_eoi (input))
1965    {
1966      token->type = END_OF_RE;
1967      return 0;
1968    }
1969  c = re_string_peek_byte (input, 0);
1970  token->opr.c = c;
1971
1972#ifdef RE_ENABLE_I18N
1973  if (input->mb_cur_max > 1 &&
1974      !re_string_first_byte (input, re_string_cur_idx (input)))
1975    {
1976      token->type = CHARACTER;
1977      return 1;
1978    }
1979#endif /* RE_ENABLE_I18N */
1980
1981  if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1982      && re_string_cur_idx (input) + 1 < re_string_length (input))
1983    {
1984      /* In this case, '\' escape a character.  */
1985      unsigned char c2;
1986      re_string_skip_bytes (input, 1);
1987      c2 = re_string_peek_byte (input, 0);
1988      token->opr.c = c2;
1989      token->type = CHARACTER;
1990      return 1;
1991    }
1992  if (c == '[') /* '[' is a special char in a bracket exps.  */
1993    {
1994      unsigned char c2;
1995      int token_len;
1996      if (re_string_cur_idx (input) + 1 < re_string_length (input))
1997	c2 = re_string_peek_byte (input, 1);
1998      else
1999	c2 = 0;
2000      token->opr.c = c2;
2001      token_len = 2;
2002      switch (c2)
2003	{
2004	case '.':
2005	  token->type = OP_OPEN_COLL_ELEM;
2006	  break;
2007	case '=':
2008	  token->type = OP_OPEN_EQUIV_CLASS;
2009	  break;
2010	case ':':
2011	  if (syntax & REG_CHAR_CLASSES)
2012	    {
2013	      token->type = OP_OPEN_CHAR_CLASS;
2014	      break;
2015	    }
2016	  /* else fall through.  */
2017	default:
2018	  token->type = CHARACTER;
2019	  token->opr.c = c;
2020	  token_len = 1;
2021	  break;
2022	}
2023      return token_len;
2024    }
2025  switch (c)
2026    {
2027    case '-':
2028      token->type = OP_CHARSET_RANGE;
2029      break;
2030    case ']':
2031      token->type = OP_CLOSE_BRACKET;
2032      break;
2033    case '^':
2034      token->type = OP_NON_MATCH_LIST;
2035      break;
2036    default:
2037      token->type = CHARACTER;
2038    }
2039  return 1;
2040}
2041
2042/* Functions for parser.  */
2043
2044/* Entry point of the parser.
2045   Parse the regular expression REGEXP and return the structure tree.
2046   If an error is occured, ERR is set by error code, and return NULL.
2047   This function build the following tree, from regular expression <reg_exp>:
2048	   CAT
2049	   / \
2050	  /   \
2051   <reg_exp>  EOR
2052
2053   CAT means concatenation.
2054   EOR means end of regular expression.  */
2055
2056static bin_tree_t *
2057parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2058       reg_errcode_t *err)
2059{
2060  re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2061  bin_tree_t *tree, *eor, *root;
2062  re_token_t current_token;
2063  dfa->syntax = syntax;
2064  fetch_token (&current_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2065  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2066  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2067    return NULL;
2068  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2069  if (tree != NULL)
2070    root = create_tree (dfa, tree, eor, CONCAT);
2071  else
2072    root = eor;
2073  if (BE (eor == NULL || root == NULL, 0))
2074    {
2075      *err = REG_ESPACE;
2076      return NULL;
2077    }
2078  return root;
2079}
2080
2081/* This function build the following tree, from regular expression
2082   <branch1>|<branch2>:
2083	   ALT
2084	   / \
2085	  /   \
2086   <branch1> <branch2>
2087
2088   ALT means alternative, which represents the operator `|'.  */
2089
2090static bin_tree_t *
2091parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2092	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2093{
2094  re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2095  bin_tree_t *tree, *branch = NULL;
2096  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2097  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2098    return NULL;
2099
2100  while (token->type == OP_ALT)
2101    {
2102      fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2103      if (token->type != OP_ALT && token->type != END_OF_RE
2104	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2105	{
2106	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2107	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2108	    return NULL;
2109	}
2110      else
2111	branch = NULL;
2112      tree = create_tree (dfa, tree, branch, OP_ALT);
2113      if (BE (tree == NULL, 0))
2114	{
2115	  *err = REG_ESPACE;
2116	  return NULL;
2117	}
2118    }
2119  return tree;
2120}
2121
2122/* This function build the following tree, from regular expression
2123   <exp1><exp2>:
2124	CAT
2125	/ \
2126       /   \
2127   <exp1> <exp2>
2128
2129   CAT means concatenation.  */
2130
2131static bin_tree_t *
2132parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2133	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2134{
2135  bin_tree_t *tree, *exp;
2136  re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2137  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2138  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2139    return NULL;
2140
2141  while (token->type != OP_ALT && token->type != END_OF_RE
2142	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2143    {
2144      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2145      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2146	{
2147	  return NULL;
2148	}
2149      if (tree != NULL && exp != NULL)
2150	{
2151	  tree = create_tree (dfa, tree, exp, CONCAT);
2152	  if (tree == NULL)
2153	    {
2154	      *err = REG_ESPACE;
2155	      return NULL;
2156	    }
2157	}
2158      else if (tree == NULL)
2159	tree = exp;
2160      /* Otherwise exp == NULL, we don't need to create new tree.  */
2161    }
2162  return tree;
2163}
2164
2165/* This function build the following tree, from regular expression a*:
2166	 *
2167	 |
2168	 a
2169*/
2170
2171static bin_tree_t *
2172parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2173		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2174{
2175  re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2176  bin_tree_t *tree;
2177  switch (token->type)
2178    {
2179    case CHARACTER:
2180      tree = create_token_tree (dfa, NULL, NULL, token);
2181      if (BE (tree == NULL, 0))
2182	{
2183	  *err = REG_ESPACE;
2184	  return NULL;
2185	}
2186#ifdef RE_ENABLE_I18N
2187      if (dfa->mb_cur_max > 1)
2188	{
2189	  while (!re_string_eoi (regexp)
2190		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2191	    {
2192	      bin_tree_t *mbc_remain;
2193	      fetch_token (token, regexp, syntax);
2194	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2195	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2196	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2197		{
2198		  *err = REG_ESPACE;
2199		  return NULL;
2200		}
2201	    }
2202	}
2203#endif
2204      break;
2205    case OP_OPEN_SUBEXP:
2206      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2207      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2208	return NULL;
2209      break;
2210    case OP_OPEN_BRACKET:
2211      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2212      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2213	return NULL;
2214      break;
2215    case OP_BACK_REF:
2216      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2217	{
2218	  *err = REG_ESUBREG;
2219	  return NULL;
2220	}
2221      dfa->used_bkref_map |= 1 << token->opr.idx;
2222      tree = create_token_tree (dfa, NULL, NULL, token);
2223      if (BE (tree == NULL, 0))
2224	{
2225	  *err = REG_ESPACE;
2226	  return NULL;
2227	}
2228      ++dfa->nbackref;
2229      dfa->has_mb_node = 1;
2230      break;
2231    case OP_OPEN_DUP_NUM:
2232      if (syntax & REG_CONTEXT_INVALID_DUP)
2233	{
2234	  *err = REG_BADRPT;
2235	  return NULL;
2236	}
2237      /* FALLTHROUGH */
2238    case OP_DUP_ASTERISK:
2239    case OP_DUP_PLUS:
2240    case OP_DUP_QUESTION:
2241      if (syntax & REG_CONTEXT_INVALID_OPS)
2242	{
2243	  *err = REG_BADRPT;
2244	  return NULL;
2245	}
2246      else if (syntax & REG_CONTEXT_INDEP_OPS)
2247	{
2248	  fetch_token (token, regexp, syntax);
2249	  return parse_expression (regexp, preg, token, syntax, nest, err);
2250	}
2251      /* else fall through  */
2252    case OP_CLOSE_SUBEXP:
2253      if ((token->type == OP_CLOSE_SUBEXP) &&
2254	  !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2255	{
2256	  *err = REG_ERPAREN;
2257	  return NULL;
2258	}
2259      /* else fall through  */
2260    case OP_CLOSE_DUP_NUM:
2261      /* We treat it as a normal character.  */
2262
2263      /* Then we can these characters as normal characters.  */
2264      token->type = CHARACTER;
2265      /* mb_partial and word_char bits should be initialized already
2266	 by peek_token.  */
2267      tree = create_token_tree (dfa, NULL, NULL, token);
2268      if (BE (tree == NULL, 0))
2269	{
2270	  *err = REG_ESPACE;
2271	  return NULL;
2272	}
2273      break;
2274    case ANCHOR:
2275      if ((token->opr.ctx_type
2276	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2277	  && dfa->word_ops_used == 0)
2278	init_word_char (dfa);
2279      if (token->opr.ctx_type == WORD_DELIM
2280          || token->opr.ctx_type == NOT_WORD_DELIM)
2281	{
2282	  bin_tree_t *tree_first, *tree_last;
2283	  if (token->opr.ctx_type == WORD_DELIM)
2284	    {
2285	      token->opr.ctx_type = WORD_FIRST;
2286	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2287	      token->opr.ctx_type = WORD_LAST;
2288            }
2289          else
2290            {
2291	      token->opr.ctx_type = INSIDE_WORD;
2292	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2293	      token->opr.ctx_type = INSIDE_NOTWORD;
2294            }
2295	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2296	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2297	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2298	    {
2299	      *err = REG_ESPACE;
2300	      return NULL;
2301	    }
2302	}
2303      else
2304	{
2305	  tree = create_token_tree (dfa, NULL, NULL, token);
2306	  if (BE (tree == NULL, 0))
2307	    {
2308	      *err = REG_ESPACE;
2309	      return NULL;
2310	    }
2311	}
2312      /* We must return here, since ANCHORs can't be followed
2313	 by repetition operators.
2314	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2315	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2316      fetch_token (token, regexp, syntax);
2317      return tree;
2318    case OP_PERIOD:
2319      tree = create_token_tree (dfa, NULL, NULL, token);
2320      if (BE (tree == NULL, 0))
2321	{
2322	  *err = REG_ESPACE;
2323	  return NULL;
2324	}
2325      if (dfa->mb_cur_max > 1)
2326	dfa->has_mb_node = 1;
2327      break;
2328    case OP_WORD:
2329    case OP_NOTWORD:
2330      tree = build_charclass_op (dfa, regexp->trans,
2331				 (const unsigned char *) "alnum",
2332				 (const unsigned char *) "_",
2333				 token->type == OP_NOTWORD, err);
2334      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2335	return NULL;
2336      break;
2337    case OP_SPACE:
2338    case OP_NOTSPACE:
2339      tree = build_charclass_op (dfa, regexp->trans,
2340				 (const unsigned char *) "space",
2341				 (const unsigned char *) "",
2342				 token->type == OP_NOTSPACE, err);
2343      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2344	return NULL;
2345      break;
2346    case OP_ALT:
2347    case END_OF_RE:
2348      return NULL;
2349    case BACK_SLASH:
2350      *err = REG_EESCAPE;
2351      return NULL;
2352    default:
2353      /* Must not happen?  */
2354#ifdef DEBUG
2355      assert (0);
2356#endif
2357      return NULL;
2358    }
2359  fetch_token (token, regexp, syntax);
2360
2361  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2362	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2363    {
2364      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2365      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2366	return NULL;
2367      /* In BRE consecutive duplications are not allowed.  */
2368      if ((syntax & REG_CONTEXT_INVALID_DUP)
2369	  && (token->type == OP_DUP_ASTERISK
2370	      || token->type == OP_OPEN_DUP_NUM))
2371	{
2372	  *err = REG_BADRPT;
2373	  return NULL;
2374	}
2375    }
2376
2377  return tree;
2378}
2379
2380/* This function build the following tree, from regular expression
2381   (<reg_exp>):
2382	 SUBEXP
2383	    |
2384	<reg_exp>
2385*/
2386
2387static bin_tree_t *
2388parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2389	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2390{
2391  re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2392  bin_tree_t *tree;
2393  size_t cur_nsub;
2394  cur_nsub = preg->re_nsub++;
2395
2396  fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2397
2398  /* The subexpression may be a null string.  */
2399  if (token->type == OP_CLOSE_SUBEXP)
2400    tree = NULL;
2401  else
2402    {
2403      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2404      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2405        *err = REG_EPAREN;
2406      if (BE (*err != REG_NOERROR, 0))
2407	return NULL;
2408    }
2409
2410  if (cur_nsub <= '9' - '1')
2411    dfa->completed_bkref_map |= 1 << cur_nsub;
2412
2413  tree = create_tree (dfa, tree, NULL, SUBEXP);
2414  if (BE (tree == NULL, 0))
2415    {
2416      *err = REG_ESPACE;
2417      return NULL;
2418    }
2419  tree->token.opr.idx = cur_nsub;
2420  return tree;
2421}
2422
2423/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2424
2425static bin_tree_t *
2426parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2427	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2428{
2429  bin_tree_t *tree = NULL, *old_tree = NULL;
2430  Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2431  re_token_t start_token = *token;
2432
2433  if (token->type == OP_OPEN_DUP_NUM)
2434    {
2435      end = 0;
2436      start = fetch_number (regexp, token, syntax);
2437      if (start == REG_MISSING)
2438	{
2439	  if (token->type == CHARACTER && token->opr.c == ',')
2440	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2441	  else
2442	    {
2443	      *err = REG_BADBR; /* <re>{} is invalid.  */
2444	      return NULL;
2445	    }
2446	}
2447      if (BE (start != REG_ERROR, 1))
2448	{
2449	  /* We treat "{n}" as "{n,n}".  */
2450	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2451		 : ((token->type == CHARACTER && token->opr.c == ',')
2452		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2453	}
2454      if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2455	{
2456	  /* Invalid sequence.  */
2457	  if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2458	    {
2459	      if (token->type == END_OF_RE)
2460		*err = REG_EBRACE;
2461	      else
2462		*err = REG_BADBR;
2463
2464	      return NULL;
2465	    }
2466
2467	  /* If the syntax bit is set, rollback.  */
2468	  re_string_set_index (regexp, start_idx);
2469	  *token = start_token;
2470	  token->type = CHARACTER;
2471	  /* mb_partial and word_char bits should be already initialized by
2472	     peek_token.  */
2473	  return elem;
2474	}
2475
2476      if (BE (end != REG_MISSING && start > end, 0))
2477	{
2478	  /* First number greater than second.  */
2479	  *err = REG_BADBR;
2480	  return NULL;
2481	}
2482    }
2483  else
2484    {
2485      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2486      end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2487    }
2488
2489  fetch_token (token, regexp, syntax);
2490
2491  if (BE (elem == NULL, 0))
2492    return NULL;
2493  if (BE (start == 0 && end == 0, 0))
2494    {
2495      postorder (elem, free_tree, NULL);
2496      return NULL;
2497    }
2498
2499  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2500  if (BE (start > 0, 0))
2501    {
2502      tree = elem;
2503      for (i = 2; i <= start; ++i)
2504	{
2505	  elem = duplicate_tree (elem, dfa);
2506	  tree = create_tree (dfa, tree, elem, CONCAT);
2507	  if (BE (elem == NULL || tree == NULL, 0))
2508	    goto parse_dup_op_espace;
2509	}
2510
2511      if (start == end)
2512	return tree;
2513
2514      /* Duplicate ELEM before it is marked optional.  */
2515      elem = duplicate_tree (elem, dfa);
2516      old_tree = tree;
2517    }
2518  else
2519    old_tree = NULL;
2520
2521  if (elem->token.type == SUBEXP)
2522    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2523
2524  tree = create_tree (dfa, elem, NULL,
2525		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2526  if (BE (tree == NULL, 0))
2527    goto parse_dup_op_espace;
2528
2529  /* This loop is actually executed only when end != REG_MISSING,
2530     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2531     already created the start+1-th copy.  */
2532  if ((Idx) -1 < 0 || end != REG_MISSING)
2533    for (i = start + 2; i <= end; ++i)
2534      {
2535	elem = duplicate_tree (elem, dfa);
2536	tree = create_tree (dfa, tree, elem, CONCAT);
2537	if (BE (elem == NULL || tree == NULL, 0))
2538	  goto parse_dup_op_espace;
2539
2540	tree = create_tree (dfa, tree, NULL, OP_ALT);
2541	if (BE (tree == NULL, 0))
2542	  goto parse_dup_op_espace;
2543      }
2544
2545  if (old_tree)
2546    tree = create_tree (dfa, old_tree, tree, CONCAT);
2547
2548  return tree;
2549
2550 parse_dup_op_espace:
2551  *err = REG_ESPACE;
2552  return NULL;
2553}
2554
2555/* Size of the names for collating symbol/equivalence_class/character_class.
2556   I'm not sure, but maybe enough.  */
2557#define BRACKET_NAME_BUF_SIZE 32
2558
2559#ifndef _LIBC
2560  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2561     Build the range expression which starts from START_ELEM, and ends
2562     at END_ELEM.  The result are written to MBCSET and SBCSET.
2563     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2564     mbcset->range_ends, is a pointer argument sinse we may
2565     update it.  */
2566
2567static reg_errcode_t
2568build_range_exp (bitset sbcset,
2569# ifdef RE_ENABLE_I18N
2570		 re_charset_t *mbcset, Idx *range_alloc,
2571# endif
2572		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2573{
2574  unsigned int start_ch, end_ch;
2575  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2576  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2577	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2578	  0))
2579    return REG_ERANGE;
2580
2581  /* We can handle no multi character collating elements without libc
2582     support.  */
2583  if (BE ((start_elem->type == COLL_SYM
2584	   && strlen ((char *) start_elem->opr.name) > 1)
2585	  || (end_elem->type == COLL_SYM
2586	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2587    return REG_ECOLLATE;
2588
2589# ifdef RE_ENABLE_I18N
2590  {
2591    wchar_t wc;
2592    wint_t start_wc, end_wc;
2593    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2594
2595    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2596		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2597		   : 0));
2598    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2599	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2600		 : 0));
2601    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2602		? __btowc (start_ch) : start_elem->opr.wch);
2603    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2604	      ? __btowc (end_ch) : end_elem->opr.wch);
2605    if (start_wc == WEOF || end_wc == WEOF)
2606      return REG_ECOLLATE;
2607    cmp_buf[0] = start_wc;
2608    cmp_buf[4] = end_wc;
2609    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2610      return REG_ERANGE;
2611
2612    /* Got valid collation sequence values, add them as a new entry.
2613       However, for !_LIBC we have no collation elements: if the
2614       character set is single byte, the single byte character set
2615       that we build below suffices.  parse_bracket_exp passes
2616       no MBCSET if dfa->mb_cur_max == 1.  */
2617    if (mbcset)
2618      {
2619        /* Check the space of the arrays.  */
2620        if (BE (*range_alloc == mbcset->nranges, 0))
2621          {
2622	    /* There is not enough space, need realloc.  */
2623	    wchar_t *new_array_start, *new_array_end;
2624	    Idx new_nranges;
2625
2626	    new_nranges = mbcset->nranges;
2627	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2628	       are NULL if *range_alloc == 0.  */
2629	    new_array_start = re_x2realloc (mbcset->range_starts, wchar_t,
2630					    &new_nranges);
2631	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2632				        new_nranges);
2633
2634	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2635	      return REG_ESPACE;
2636
2637	    mbcset->range_starts = new_array_start;
2638	    mbcset->range_ends = new_array_end;
2639	    *range_alloc = new_nranges;
2640          }
2641
2642        mbcset->range_starts[mbcset->nranges] = start_wc;
2643        mbcset->range_ends[mbcset->nranges++] = end_wc;
2644      }
2645
2646    /* Build the table for single byte characters.  */
2647    for (wc = 0; wc < SBC_MAX; ++wc)
2648      {
2649	cmp_buf[2] = wc;
2650	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2651	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2652	  bitset_set (sbcset, wc);
2653      }
2654  }
2655# else /* not RE_ENABLE_I18N */
2656  {
2657    unsigned int ch;
2658    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2659		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2660		   : 0));
2661    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2662	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2663		 : 0));
2664    if (start_ch > end_ch)
2665      return REG_ERANGE;
2666    /* Build the table for single byte characters.  */
2667    for (ch = 0; ch < SBC_MAX; ++ch)
2668      if (start_ch <= ch  && ch <= end_ch)
2669	bitset_set (sbcset, ch);
2670  }
2671# endif /* not RE_ENABLE_I18N */
2672  return REG_NOERROR;
2673}
2674#endif /* not _LIBC */
2675
2676#ifndef _LIBC
2677/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2678   Build the collating element which is represented by NAME.
2679   The result are written to MBCSET and SBCSET.
2680   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2681   pointer argument since we may update it.  */
2682
2683static reg_errcode_t
2684build_collating_symbol (bitset sbcset,
2685# ifdef RE_ENABLE_I18N
2686			re_charset_t *mbcset, Idx *coll_sym_alloc,
2687# endif
2688			const unsigned char *name)
2689{
2690  size_t name_len = strlen ((const char *) name);
2691  if (BE (name_len != 1, 0))
2692    return REG_ECOLLATE;
2693  else
2694    {
2695      bitset_set (sbcset, name[0]);
2696      return REG_NOERROR;
2697    }
2698}
2699#endif /* not _LIBC */
2700
2701/* This function parse bracket expression like "[abc]", "[a-c]",
2702   "[[.a-a.]]" etc.  */
2703
2704static bin_tree_t *
2705parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2706		   reg_syntax_t syntax, reg_errcode_t *err)
2707{
2708#ifdef _LIBC
2709  const unsigned char *collseqmb;
2710  const char *collseqwc;
2711  uint32_t nrules;
2712  int32_t table_size;
2713  const int32_t *symb_table;
2714  const unsigned char *extra;
2715
2716  /* Local function for parse_bracket_exp used in _LIBC environement.
2717     Seek the collating symbol entry correspondings to NAME.
2718     Return the index of the symbol in the SYMB_TABLE.  */
2719
2720  auto inline int32_t
2721  __attribute ((always_inline))
2722  seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2723    {
2724      int32_t hash = elem_hash ((const char *) name, name_len);
2725      int32_t elem = hash % table_size;
2726      int32_t second = hash % (table_size - 2);
2727      while (symb_table[2 * elem] != 0)
2728	{
2729	  /* First compare the hashing value.  */
2730	  if (symb_table[2 * elem] == hash
2731	      /* Compare the length of the name.  */
2732	      && name_len == extra[symb_table[2 * elem + 1]]
2733	      /* Compare the name.  */
2734	      && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2735			 name_len) == 0)
2736	    {
2737	      /* Yep, this is the entry.  */
2738	      break;
2739	    }
2740
2741	  /* Next entry.  */
2742	  elem += second;
2743	}
2744      return elem;
2745    }
2746
2747  /* Local function for parse_bracket_exp used in _LIBC environement.
2748     Look up the collation sequence value of BR_ELEM.
2749     Return the value if succeeded, UINT_MAX otherwise.  */
2750
2751  auto inline unsigned int
2752  __attribute ((always_inline))
2753  lookup_collation_sequence_value (bracket_elem_t *br_elem)
2754    {
2755      if (br_elem->type == SB_CHAR)
2756	{
2757	  /*
2758	  if (MB_CUR_MAX == 1)
2759	  */
2760	  if (nrules == 0)
2761	    return collseqmb[br_elem->opr.ch];
2762	  else
2763	    {
2764	      wint_t wc = __btowc (br_elem->opr.ch);
2765	      return __collseq_table_lookup (collseqwc, wc);
2766	    }
2767	}
2768      else if (br_elem->type == MB_CHAR)
2769	{
2770	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2771	}
2772      else if (br_elem->type == COLL_SYM)
2773	{
2774	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2775	  if (nrules != 0)
2776	    {
2777	      int32_t elem, idx;
2778	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2779						  sym_name_len);
2780	      if (symb_table[2 * elem] != 0)
2781		{
2782		  /* We found the entry.  */
2783		  idx = symb_table[2 * elem + 1];
2784		  /* Skip the name of collating element name.  */
2785		  idx += 1 + extra[idx];
2786		  /* Skip the byte sequence of the collating element.  */
2787		  idx += 1 + extra[idx];
2788		  /* Adjust for the alignment.  */
2789		  idx = (idx + 3) & ~3;
2790		  /* Skip the multibyte collation sequence value.  */
2791		  idx += sizeof (unsigned int);
2792		  /* Skip the wide char sequence of the collating element.  */
2793		  idx += sizeof (unsigned int) *
2794		    (1 + *(unsigned int *) (extra + idx));
2795		  /* Return the collation sequence value.  */
2796		  return *(unsigned int *) (extra + idx);
2797		}
2798	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2799		{
2800		  /* No valid character.  Match it as a single byte
2801		     character.  */
2802		  return collseqmb[br_elem->opr.name[0]];
2803		}
2804	    }
2805	  else if (sym_name_len == 1)
2806	    return collseqmb[br_elem->opr.name[0]];
2807	}
2808      return UINT_MAX;
2809    }
2810
2811  /* Local function for parse_bracket_exp used in _LIBC environement.
2812     Build the range expression which starts from START_ELEM, and ends
2813     at END_ELEM.  The result are written to MBCSET and SBCSET.
2814     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2815     mbcset->range_ends, is a pointer argument sinse we may
2816     update it.  */
2817
2818  auto inline reg_errcode_t
2819  __attribute ((always_inline))
2820  build_range_exp (bitset sbcset, re_charset_t *mbcset,
2821		   Idx *range_alloc,
2822		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2823    {
2824      unsigned int ch;
2825      uint32_t start_collseq;
2826      uint32_t end_collseq;
2827
2828      /* Equivalence Classes and Character Classes can't be a range
2829	 start/end.  */
2830      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2831	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2832	      0))
2833	return REG_ERANGE;
2834
2835      start_collseq = lookup_collation_sequence_value (start_elem);
2836      end_collseq = lookup_collation_sequence_value (end_elem);
2837      /* Check start/end collation sequence values.  */
2838      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2839	return REG_ECOLLATE;
2840      if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2841	return REG_ERANGE;
2842
2843      /* Got valid collation sequence values, add them as a new entry.
2844	 However, if we have no collation elements, and the character set
2845	 is single byte, the single byte character set that we
2846	 build below suffices. */
2847      if (nrules > 0 || dfa->mb_cur_max > 1)
2848	{
2849          /* Check the space of the arrays.  */
2850          if (BE (*range_alloc == mbcset->nranges, 0))
2851	    {
2852	      /* There is not enough space, need realloc.  */
2853	      uint32_t *new_array_start;
2854	      uint32_t *new_array_end;
2855	      Idx new_nranges;
2856
2857	      new_nranges = mbcset->nranges;
2858	      new_array_start = re_x2realloc (mbcset->range_starts, uint32_t,
2859					      &new_nranges);
2860	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2861				          new_nranges);
2862
2863	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2864	        return REG_ESPACE;
2865
2866	      mbcset->range_starts = new_array_start;
2867	      mbcset->range_ends = new_array_end;
2868	      *range_alloc = new_nranges;
2869	    }
2870
2871          mbcset->range_starts[mbcset->nranges] = start_collseq;
2872          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2873	}
2874
2875      /* Build the table for single byte characters.  */
2876      for (ch = 0; ch < SBC_MAX; ch++)
2877	{
2878	  uint32_t ch_collseq;
2879	  /*
2880	  if (MB_CUR_MAX == 1)
2881	  */
2882	  if (nrules == 0)
2883	    ch_collseq = collseqmb[ch];
2884	  else
2885	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2886	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2887	    bitset_set (sbcset, ch);
2888	}
2889      return REG_NOERROR;
2890    }
2891
2892  /* Local function for parse_bracket_exp used in _LIBC environement.
2893     Build the collating element which is represented by NAME.
2894     The result are written to MBCSET and SBCSET.
2895     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2896     pointer argument sinse we may update it.  */
2897
2898  auto inline reg_errcode_t
2899  __attribute ((always_inline))
2900  build_collating_symbol (bitset sbcset, re_charset_t *mbcset,
2901			  Idx *coll_sym_alloc, const unsigned char *name)
2902    {
2903      int32_t elem, idx;
2904      size_t name_len = strlen ((const char *) name);
2905      if (nrules != 0)
2906	{
2907	  elem = seek_collating_symbol_entry (name, name_len);
2908	  if (symb_table[2 * elem] != 0)
2909	    {
2910	      /* We found the entry.  */
2911	      idx = symb_table[2 * elem + 1];
2912	      /* Skip the name of collating element name.  */
2913	      idx += 1 + extra[idx];
2914	    }
2915	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2916	    {
2917	      /* No valid character, treat it as a normal
2918		 character.  */
2919	      bitset_set (sbcset, name[0]);
2920	      return REG_NOERROR;
2921	    }
2922	  else
2923	    return REG_ECOLLATE;
2924
2925	  /* Got valid collation sequence, add it as a new entry.  */
2926	  /* Check the space of the arrays.  */
2927	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2928	    {
2929	      /* Not enough, realloc it.  */
2930	      Idx new_coll_sym_alloc = mbcset->ncoll_syms;
2931	      /* Use realloc since mbcset->coll_syms is NULL
2932		 if *alloc == 0.  */
2933	      int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t,
2934						     &new_coll_sym_alloc);
2935	      if (BE (new_coll_syms == NULL, 0))
2936		return REG_ESPACE;
2937	      mbcset->coll_syms = new_coll_syms;
2938	      *coll_sym_alloc = new_coll_sym_alloc;
2939	    }
2940	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2941	  return REG_NOERROR;
2942	}
2943      else
2944	{
2945	  if (BE (name_len != 1, 0))
2946	    return REG_ECOLLATE;
2947	  else
2948	    {
2949	      bitset_set (sbcset, name[0]);
2950	      return REG_NOERROR;
2951	    }
2952	}
2953    }
2954#endif
2955
2956  re_token_t br_token;
2957  re_bitset_ptr_t sbcset;
2958#ifdef RE_ENABLE_I18N
2959  re_charset_t *mbcset;
2960  Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2961  Idx equiv_class_alloc = 0, char_class_alloc = 0;
2962#endif /* not RE_ENABLE_I18N */
2963  bool non_match = false;
2964  bin_tree_t *work_tree;
2965  int token_len;
2966  bool first_round = true;
2967#ifdef _LIBC
2968  collseqmb = (const unsigned char *)
2969    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2970  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2971  if (nrules)
2972    {
2973      /*
2974      if (MB_CUR_MAX > 1)
2975      */
2976	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2977      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2978      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2979						  _NL_COLLATE_SYMB_TABLEMB);
2980      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2981						   _NL_COLLATE_SYMB_EXTRAMB);
2982    }
2983#endif
2984  sbcset = re_calloc (bitset_word, BITSET_WORDS);
2985#ifdef RE_ENABLE_I18N
2986  mbcset = re_calloc (re_charset_t, 1);
2987#endif /* RE_ENABLE_I18N */
2988#ifdef RE_ENABLE_I18N
2989  if (BE (sbcset == NULL || mbcset == NULL, 0))
2990#else
2991  if (BE (sbcset == NULL, 0))
2992#endif /* RE_ENABLE_I18N */
2993    {
2994      *err = REG_ESPACE;
2995      return NULL;
2996    }
2997
2998  token_len = peek_token_bracket (token, regexp, syntax);
2999  if (BE (token->type == END_OF_RE, 0))
3000    {
3001      *err = REG_BADPAT;
3002      goto parse_bracket_exp_free_return;
3003    }
3004  if (token->type == OP_NON_MATCH_LIST)
3005    {
3006#ifdef RE_ENABLE_I18N
3007      mbcset->non_match = 1;
3008#endif /* not RE_ENABLE_I18N */
3009      non_match = true;
3010      if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3011	bitset_set (sbcset, '\0');
3012      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3013      token_len = peek_token_bracket (token, regexp, syntax);
3014      if (BE (token->type == END_OF_RE, 0))
3015	{
3016	  *err = REG_BADPAT;
3017	  goto parse_bracket_exp_free_return;
3018	}
3019    }
3020
3021  /* We treat the first ']' as a normal character.  */
3022  if (token->type == OP_CLOSE_BRACKET)
3023    token->type = CHARACTER;
3024
3025  while (1)
3026    {
3027      bracket_elem_t start_elem, end_elem;
3028      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3029      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3030      reg_errcode_t ret;
3031      int token_len2 = 0;
3032      bool is_range_exp = false;
3033      re_token_t token2;
3034
3035      start_elem.opr.name = start_name_buf;
3036      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3037				   syntax, first_round);
3038      if (BE (ret != REG_NOERROR, 0))
3039	{
3040	  *err = ret;
3041	  goto parse_bracket_exp_free_return;
3042	}
3043      first_round = false;
3044
3045      /* Get information about the next token.  We need it in any case.  */
3046      token_len = peek_token_bracket (token, regexp, syntax);
3047
3048      /* Do not check for ranges if we know they are not allowed.  */
3049      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3050	{
3051	  if (BE (token->type == END_OF_RE, 0))
3052	    {
3053	      *err = REG_EBRACK;
3054	      goto parse_bracket_exp_free_return;
3055	    }
3056	  if (token->type == OP_CHARSET_RANGE)
3057	    {
3058	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3059	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3060	      if (BE (token2.type == END_OF_RE, 0))
3061		{
3062		  *err = REG_EBRACK;
3063		  goto parse_bracket_exp_free_return;
3064		}
3065	      if (token2.type == OP_CLOSE_BRACKET)
3066		{
3067		  /* We treat the last '-' as a normal character.  */
3068		  re_string_skip_bytes (regexp, -token_len);
3069		  token->type = CHARACTER;
3070		}
3071	      else
3072		is_range_exp = true;
3073	    }
3074	}
3075
3076      if (is_range_exp == true)
3077	{
3078	  end_elem.opr.name = end_name_buf;
3079	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3080				       dfa, syntax, true);
3081	  if (BE (ret != REG_NOERROR, 0))
3082	    {
3083	      *err = ret;
3084	      goto parse_bracket_exp_free_return;
3085	    }
3086
3087	  token_len = peek_token_bracket (token, regexp, syntax);
3088
3089#ifdef _LIBC
3090	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3091				  &start_elem, &end_elem);
3092#else
3093# ifdef RE_ENABLE_I18N
3094	  *err = build_range_exp (sbcset,
3095				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3096				  &range_alloc, &start_elem, &end_elem);
3097# else
3098	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3099# endif
3100#endif /* RE_ENABLE_I18N */
3101	  if (BE (*err != REG_NOERROR, 0))
3102	    goto parse_bracket_exp_free_return;
3103	}
3104      else
3105	{
3106	  switch (start_elem.type)
3107	    {
3108	    case SB_CHAR:
3109	      bitset_set (sbcset, start_elem.opr.ch);
3110	      break;
3111#ifdef RE_ENABLE_I18N
3112	    case MB_CHAR:
3113	      /* Check whether the array has enough space.  */
3114	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3115		{
3116		  wchar_t *new_mbchars;
3117		  /* Not enough, realloc it.  */
3118		  mbchar_alloc = mbcset->nmbchars;
3119		  /* Use realloc since array is NULL if *alloc == 0.  */
3120		  new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t,
3121					      &mbchar_alloc);
3122		  if (BE (new_mbchars == NULL, 0))
3123		    goto parse_bracket_exp_espace;
3124		  mbcset->mbchars = new_mbchars;
3125		}
3126	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3127	      break;
3128#endif /* RE_ENABLE_I18N */
3129	    case EQUIV_CLASS:
3130	      *err = build_equiv_class (sbcset,
3131#ifdef RE_ENABLE_I18N
3132					mbcset, &equiv_class_alloc,
3133#endif /* RE_ENABLE_I18N */
3134					start_elem.opr.name);
3135	      if (BE (*err != REG_NOERROR, 0))
3136		goto parse_bracket_exp_free_return;
3137	      break;
3138	    case COLL_SYM:
3139	      *err = build_collating_symbol (sbcset,
3140#ifdef RE_ENABLE_I18N
3141					     mbcset, &coll_sym_alloc,
3142#endif /* RE_ENABLE_I18N */
3143					     start_elem.opr.name);
3144	      if (BE (*err != REG_NOERROR, 0))
3145		goto parse_bracket_exp_free_return;
3146	      break;
3147	    case CHAR_CLASS:
3148	      *err = build_charclass (regexp->trans, sbcset,
3149#ifdef RE_ENABLE_I18N
3150				      mbcset, &char_class_alloc,
3151#endif /* RE_ENABLE_I18N */
3152				      start_elem.opr.name, syntax);
3153	      if (BE (*err != REG_NOERROR, 0))
3154	       goto parse_bracket_exp_free_return;
3155	      break;
3156	    default:
3157	      assert (0);
3158	      break;
3159	    }
3160	}
3161      if (BE (token->type == END_OF_RE, 0))
3162	{
3163	  *err = REG_EBRACK;
3164	  goto parse_bracket_exp_free_return;
3165	}
3166      if (token->type == OP_CLOSE_BRACKET)
3167	break;
3168    }
3169
3170  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3171
3172  /* If it is non-matching list.  */
3173  if (non_match)
3174    bitset_not (sbcset);
3175
3176#ifdef RE_ENABLE_I18N
3177  /* Ensure only single byte characters are set.  */
3178  if (dfa->mb_cur_max > 1)
3179    bitset_mask (sbcset, dfa->sb_char);
3180
3181  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3182      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3183						     || mbcset->non_match)))
3184    {
3185      bin_tree_t *mbc_tree;
3186      int sbc_idx;
3187      /* Build a tree for complex bracket.  */
3188      dfa->has_mb_node = 1;
3189      br_token.type = COMPLEX_BRACKET;
3190      br_token.opr.mbcset = mbcset;
3191      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3192      if (BE (mbc_tree == NULL, 0))
3193	goto parse_bracket_exp_espace;
3194      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3195	if (sbcset[sbc_idx])
3196	  break;
3197      /* If there are no bits set in sbcset, there is no point
3198	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3199      if (sbc_idx < BITSET_WORDS)
3200	{
3201          /* Build a tree for simple bracket.  */
3202          br_token.type = SIMPLE_BRACKET;
3203          br_token.opr.sbcset = sbcset;
3204          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3205          if (BE (work_tree == NULL, 0))
3206            goto parse_bracket_exp_espace;
3207
3208          /* Then join them by ALT node.  */
3209          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3210          if (BE (work_tree == NULL, 0))
3211            goto parse_bracket_exp_espace;
3212	}
3213      else
3214	{
3215	  re_free (sbcset);
3216	  work_tree = mbc_tree;
3217	}
3218    }
3219  else
3220#endif /* not RE_ENABLE_I18N */
3221    {
3222#ifdef RE_ENABLE_I18N
3223      free_charset (mbcset);
3224#endif
3225      /* Build a tree for simple bracket.  */
3226      br_token.type = SIMPLE_BRACKET;
3227      br_token.opr.sbcset = sbcset;
3228      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3229      if (BE (work_tree == NULL, 0))
3230        goto parse_bracket_exp_espace;
3231    }
3232  return work_tree;
3233
3234 parse_bracket_exp_espace:
3235  *err = REG_ESPACE;
3236 parse_bracket_exp_free_return:
3237  re_free (sbcset);
3238#ifdef RE_ENABLE_I18N
3239  free_charset (mbcset);
3240#endif /* RE_ENABLE_I18N */
3241  return NULL;
3242}
3243
3244/* Parse an element in the bracket expression.  */
3245
3246static reg_errcode_t
3247parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3248		       re_token_t *token, int token_len, re_dfa_t *dfa,
3249		       reg_syntax_t syntax, bool accept_hyphen)
3250{
3251#ifdef RE_ENABLE_I18N
3252  int cur_char_size;
3253  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3254  if (cur_char_size > 1)
3255    {
3256      elem->type = MB_CHAR;
3257      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3258      re_string_skip_bytes (regexp, cur_char_size);
3259      return REG_NOERROR;
3260    }
3261#endif /* RE_ENABLE_I18N */
3262  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3263  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3264      || token->type == OP_OPEN_EQUIV_CLASS)
3265    return parse_bracket_symbol (elem, regexp, token);
3266  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3267    {
3268      /* A '-' must only appear as anything but a range indicator before
3269	 the closing bracket.  Everything else is an error.  */
3270      re_token_t token2;
3271      (void) peek_token_bracket (&token2, regexp, syntax);
3272      if (token2.type != OP_CLOSE_BRACKET)
3273	/* The actual error value is not standardized since this whole
3274	   case is undefined.  But ERANGE makes good sense.  */
3275	return REG_ERANGE;
3276    }
3277  elem->type = SB_CHAR;
3278  elem->opr.ch = token->opr.c;
3279  return REG_NOERROR;
3280}
3281
3282/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3283   such as [:<character_class>:], [.<collating_element>.], and
3284   [=<equivalent_class>=].  */
3285
3286static reg_errcode_t
3287parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3288		      re_token_t *token)
3289{
3290  unsigned char ch, delim = token->opr.c;
3291  int i = 0;
3292  if (re_string_eoi(regexp))
3293    return REG_EBRACK;
3294  for (;; ++i)
3295    {
3296      if (i >= BRACKET_NAME_BUF_SIZE)
3297	return REG_EBRACK;
3298      if (token->type == OP_OPEN_CHAR_CLASS)
3299	ch = re_string_fetch_byte_case (regexp);
3300      else
3301	ch = re_string_fetch_byte (regexp);
3302      if (re_string_eoi(regexp))
3303	return REG_EBRACK;
3304      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3305	break;
3306      elem->opr.name[i] = ch;
3307    }
3308  re_string_skip_bytes (regexp, 1);
3309  elem->opr.name[i] = '\0';
3310  switch (token->type)
3311    {
3312    case OP_OPEN_COLL_ELEM:
3313      elem->type = COLL_SYM;
3314      break;
3315    case OP_OPEN_EQUIV_CLASS:
3316      elem->type = EQUIV_CLASS;
3317      break;
3318    case OP_OPEN_CHAR_CLASS:
3319      elem->type = CHAR_CLASS;
3320      break;
3321    default:
3322      break;
3323    }
3324  return REG_NOERROR;
3325}
3326
3327  /* Helper function for parse_bracket_exp.
3328     Build the equivalence class which is represented by NAME.
3329     The result are written to MBCSET and SBCSET.
3330     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3331     is a pointer argument sinse we may update it.  */
3332
3333static reg_errcode_t
3334build_equiv_class (bitset sbcset,
3335#ifdef RE_ENABLE_I18N
3336		   re_charset_t *mbcset, Idx *equiv_class_alloc,
3337#endif
3338		   const unsigned char *name)
3339{
3340#if defined _LIBC
3341  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3342  if (nrules != 0)
3343    {
3344      const int32_t *table, *indirect;
3345      const unsigned char *weights, *extra, *cp;
3346      unsigned char char_buf[2];
3347      int32_t idx1, idx2;
3348      unsigned int ch;
3349      size_t len;
3350      /* This #include defines a local function!  */
3351# include <locale/weight.h>
3352      /* Calculate the index for equivalence class.  */
3353      cp = name;
3354      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3355      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3356					       _NL_COLLATE_WEIGHTMB);
3357      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3358						   _NL_COLLATE_EXTRAMB);
3359      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3360						_NL_COLLATE_INDIRECTMB);
3361      idx1 = findidx (&cp);
3362      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3363	/* This isn't a valid character.  */
3364	return REG_ECOLLATE;
3365
3366      /* Build single byte matcing table for this equivalence class.  */
3367      char_buf[1] = (unsigned char) '\0';
3368      len = weights[idx1];
3369      for (ch = 0; ch < SBC_MAX; ++ch)
3370	{
3371	  char_buf[0] = ch;
3372	  cp = char_buf;
3373	  idx2 = findidx (&cp);
3374/*
3375	  idx2 = table[ch];
3376*/
3377	  if (idx2 == 0)
3378	    /* This isn't a valid character.  */
3379	    continue;
3380	  if (len == weights[idx2])
3381	    {
3382	      int cnt = 0;
3383	      while (cnt <= len &&
3384		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3385		++cnt;
3386
3387	      if (cnt > len)
3388		bitset_set (sbcset, ch);
3389	    }
3390	}
3391      /* Check whether the array has enough space.  */
3392      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3393	{
3394	  /* Not enough, realloc it.  */
3395	  Idx new_equiv_class_alloc = mbcset->nequiv_classes;
3396	  /* Use realloc since the array is NULL if *alloc == 0.  */
3397	  int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes,
3398						     int32_t,
3399						     &new_equiv_class_alloc);
3400	  if (BE (new_equiv_classes == NULL, 0))
3401	    return REG_ESPACE;
3402	  mbcset->equiv_classes = new_equiv_classes;
3403	  *equiv_class_alloc = new_equiv_class_alloc;
3404	}
3405      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3406    }
3407  else
3408#endif /* _LIBC */
3409    {
3410      if (BE (strlen ((const char *) name) != 1, 0))
3411	return REG_ECOLLATE;
3412      bitset_set (sbcset, *name);
3413    }
3414  return REG_NOERROR;
3415}
3416
3417  /* Helper function for parse_bracket_exp.
3418     Build the character class which is represented by NAME.
3419     The result are written to MBCSET and SBCSET.
3420     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3421     is a pointer argument sinse we may update it.  */
3422
3423static reg_errcode_t
3424build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset,
3425#ifdef RE_ENABLE_I18N
3426		 re_charset_t *mbcset, Idx *char_class_alloc,
3427#endif
3428		 const unsigned char *class_name, reg_syntax_t syntax)
3429{
3430  int i;
3431  const char *name = (const char *) class_name;
3432
3433  /* In case of REG_ICASE "upper" and "lower" match the both of
3434     upper and lower cases.  */
3435  if ((syntax & REG_IGNORE_CASE)
3436      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3437    name = "alpha";
3438
3439#ifdef RE_ENABLE_I18N
3440  /* Check the space of the arrays.  */
3441  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3442    {
3443      /* Not enough, realloc it.  */
3444      Idx new_char_class_alloc = mbcset->nchar_classes;
3445      /* Use realloc since array is NULL if *alloc == 0.  */
3446      wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t,
3447						 &new_char_class_alloc);
3448      if (BE (new_char_classes == NULL, 0))
3449	return REG_ESPACE;
3450      mbcset->char_classes = new_char_classes;
3451      *char_class_alloc = new_char_class_alloc;
3452    }
3453  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3454#endif /* RE_ENABLE_I18N */
3455
3456#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3457    for (i = 0; i < SBC_MAX; ++i)		\
3458      {						\
3459	if (ctype_func (i))			\
3460	  {					\
3461	    int ch = trans ? trans[i] : i;	\
3462	    bitset_set (sbcset, ch);		\
3463	  }					\
3464      }
3465
3466  if (strcmp (name, "alnum") == 0)
3467    BUILD_CHARCLASS_LOOP (isalnum)
3468  else if (strcmp (name, "cntrl") == 0)
3469    BUILD_CHARCLASS_LOOP (iscntrl)
3470  else if (strcmp (name, "lower") == 0)
3471    BUILD_CHARCLASS_LOOP (islower)
3472  else if (strcmp (name, "space") == 0)
3473    BUILD_CHARCLASS_LOOP (isspace)
3474  else if (strcmp (name, "alpha") == 0)
3475    BUILD_CHARCLASS_LOOP (isalpha)
3476  else if (strcmp (name, "digit") == 0)
3477    BUILD_CHARCLASS_LOOP (isdigit)
3478  else if (strcmp (name, "print") == 0)
3479    BUILD_CHARCLASS_LOOP (isprint)
3480  else if (strcmp (name, "upper") == 0)
3481    BUILD_CHARCLASS_LOOP (isupper)
3482  else if (strcmp (name, "blank") == 0)
3483    BUILD_CHARCLASS_LOOP (isblank)
3484  else if (strcmp (name, "graph") == 0)
3485    BUILD_CHARCLASS_LOOP (isgraph)
3486  else if (strcmp (name, "punct") == 0)
3487    BUILD_CHARCLASS_LOOP (ispunct)
3488  else if (strcmp (name, "xdigit") == 0)
3489    BUILD_CHARCLASS_LOOP (isxdigit)
3490  else
3491    return REG_ECTYPE;
3492
3493  return REG_NOERROR;
3494}
3495
3496static bin_tree_t *
3497build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3498		    const unsigned char *class_name,
3499		    const unsigned char *extra,
3500		    bool non_match, reg_errcode_t *err)
3501{
3502  re_bitset_ptr_t sbcset;
3503#ifdef RE_ENABLE_I18N
3504  re_charset_t *mbcset;
3505  Idx alloc = 0;
3506#endif /* not RE_ENABLE_I18N */
3507  reg_errcode_t ret;
3508  re_token_t br_token;
3509  bin_tree_t *tree;
3510
3511  sbcset = re_calloc (bitset_word, BITSET_WORDS);
3512#ifdef RE_ENABLE_I18N
3513  mbcset = re_calloc (re_charset_t, 1);
3514#endif /* RE_ENABLE_I18N */
3515
3516#ifdef RE_ENABLE_I18N
3517  if (BE (sbcset == NULL || mbcset == NULL, 0))
3518#else /* not RE_ENABLE_I18N */
3519  if (BE (sbcset == NULL, 0))
3520#endif /* not RE_ENABLE_I18N */
3521    {
3522      *err = REG_ESPACE;
3523      return NULL;
3524    }
3525
3526  if (non_match)
3527    {
3528#ifdef RE_ENABLE_I18N
3529      /*
3530      if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3531	bitset_set(cset->sbcset, '\0');
3532      */
3533      mbcset->non_match = 1;
3534#endif /* not RE_ENABLE_I18N */
3535    }
3536
3537  /* We don't care the syntax in this case.  */
3538  ret = build_charclass (trans, sbcset,
3539#ifdef RE_ENABLE_I18N
3540			 mbcset, &alloc,
3541#endif /* RE_ENABLE_I18N */
3542			 class_name, 0);
3543
3544  if (BE (ret != REG_NOERROR, 0))
3545    {
3546      re_free (sbcset);
3547#ifdef RE_ENABLE_I18N
3548      free_charset (mbcset);
3549#endif /* RE_ENABLE_I18N */
3550      *err = ret;
3551      return NULL;
3552    }
3553  /* \w match '_' also.  */
3554  for (; *extra; extra++)
3555    bitset_set (sbcset, *extra);
3556
3557  /* If it is non-matching list.  */
3558  if (non_match)
3559    bitset_not (sbcset);
3560
3561#ifdef RE_ENABLE_I18N
3562  /* Ensure only single byte characters are set.  */
3563  if (dfa->mb_cur_max > 1)
3564    bitset_mask (sbcset, dfa->sb_char);
3565#endif
3566
3567  /* Build a tree for simple bracket.  */
3568  br_token.type = SIMPLE_BRACKET;
3569  br_token.opr.sbcset = sbcset;
3570  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3571  if (BE (tree == NULL, 0))
3572    goto build_word_op_espace;
3573
3574#ifdef RE_ENABLE_I18N
3575  if (dfa->mb_cur_max > 1)
3576    {
3577      bin_tree_t *mbc_tree;
3578      /* Build a tree for complex bracket.  */
3579      br_token.type = COMPLEX_BRACKET;
3580      br_token.opr.mbcset = mbcset;
3581      dfa->has_mb_node = 1;
3582      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3583      if (BE (mbc_tree == NULL, 0))
3584	goto build_word_op_espace;
3585      /* Then join them by ALT node.  */
3586      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3587      if (BE (mbc_tree != NULL, 1))
3588	return tree;
3589    }
3590  else
3591    {
3592      free_charset (mbcset);
3593      return tree;
3594    }
3595#else /* not RE_ENABLE_I18N */
3596  return tree;
3597#endif /* not RE_ENABLE_I18N */
3598
3599 build_word_op_espace:
3600  re_free (sbcset);
3601#ifdef RE_ENABLE_I18N
3602  free_charset (mbcset);
3603#endif /* RE_ENABLE_I18N */
3604  *err = REG_ESPACE;
3605  return NULL;
3606}
3607
3608/* This is intended for the expressions like "a{1,3}".
3609   Fetch a number from `input', and return the number.
3610   Return REG_MISSING if the number field is empty like "{,1}".
3611   Return REG_ERROR if an error occurred.  */
3612
3613static Idx
3614fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3615{
3616  Idx num = REG_MISSING;
3617  unsigned char c;
3618  while (1)
3619    {
3620      fetch_token (token, input, syntax);
3621      c = token->opr.c;
3622      if (BE (token->type == END_OF_RE, 0))
3623	return REG_ERROR;
3624      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3625	break;
3626      num = ((token->type != CHARACTER || c < '0' || '9' < c
3627	      || num == REG_ERROR)
3628	     ? REG_ERROR
3629	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3630      num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3631    }
3632  return num;
3633}
3634
3635#ifdef RE_ENABLE_I18N
3636static void
3637free_charset (re_charset_t *cset)
3638{
3639  re_free (cset->mbchars);
3640# ifdef _LIBC
3641  re_free (cset->coll_syms);
3642  re_free (cset->equiv_classes);
3643  re_free (cset->range_starts);
3644  re_free (cset->range_ends);
3645# endif
3646  re_free (cset->char_classes);
3647  re_free (cset);
3648}
3649#endif /* RE_ENABLE_I18N */
3650
3651/* Functions for binary tree operation.  */
3652
3653/* Create a tree node.  */
3654
3655static bin_tree_t *
3656create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3657	     re_token_type_t type)
3658{
3659  re_token_t t;
3660  t.type = type;
3661  return create_token_tree (dfa, left, right, &t);
3662}
3663
3664static bin_tree_t *
3665create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3666		   const re_token_t *token)
3667{
3668  bin_tree_t *tree;
3669  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3670    {
3671      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3672
3673      if (storage == NULL)
3674	return NULL;
3675      storage->next = dfa->str_tree_storage;
3676      dfa->str_tree_storage = storage;
3677      dfa->str_tree_storage_idx = 0;
3678    }
3679  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3680
3681  tree->parent = NULL;
3682  tree->left = left;
3683  tree->right = right;
3684  tree->token = *token;
3685  tree->token.duplicated = 0;
3686  tree->token.opt_subexp = 0;
3687  tree->first = NULL;
3688  tree->next = NULL;
3689  tree->node_idx = REG_MISSING;
3690
3691  if (left != NULL)
3692    left->parent = tree;
3693  if (right != NULL)
3694    right->parent = tree;
3695  return tree;
3696}
3697
3698/* Mark the tree SRC as an optional subexpression.
3699   To be called from preorder or postorder.  */
3700
3701static reg_errcode_t
3702mark_opt_subexp (void *extra, bin_tree_t *node)
3703{
3704  Idx idx = (Idx) (long) extra;
3705  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3706    node->token.opt_subexp = 1;
3707
3708  return REG_NOERROR;
3709}
3710
3711/* Free the allocated memory inside NODE. */
3712
3713static void
3714free_token (re_token_t *node)
3715{
3716#ifdef RE_ENABLE_I18N
3717  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3718    free_charset (node->opr.mbcset);
3719  else
3720#endif /* RE_ENABLE_I18N */
3721    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3722      re_free (node->opr.sbcset);
3723}
3724
3725/* Worker function for tree walking.  Free the allocated memory inside NODE
3726   and its children. */
3727
3728static reg_errcode_t
3729free_tree (void *extra, bin_tree_t *node)
3730{
3731  free_token (&node->token);
3732  return REG_NOERROR;
3733}
3734
3735
3736/* Duplicate the node SRC, and return new node.  This is a preorder
3737   visit similar to the one implemented by the generic visitor, but
3738   we need more infrastructure to maintain two parallel trees --- so,
3739   it's easier to duplicate.  */
3740
3741static bin_tree_t *
3742duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3743{
3744  const bin_tree_t *node;
3745  bin_tree_t *dup_root;
3746  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3747
3748  for (node = root; ; )
3749    {
3750      /* Create a new tree and link it back to the current parent.  */
3751      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3752      if (*p_new == NULL)
3753	return NULL;
3754      (*p_new)->parent = dup_node;
3755      (*p_new)->token.duplicated = 1;
3756      dup_node = *p_new;
3757
3758      /* Go to the left node, or up and to the right.  */
3759      if (node->left)
3760	{
3761	  node = node->left;
3762	  p_new = &dup_node->left;
3763	}
3764      else
3765	{
3766	  const bin_tree_t *prev = NULL;
3767	  while (node->right == prev || node->right == NULL)
3768	    {
3769	      prev = node;
3770	      node = node->parent;
3771	      dup_node = dup_node->parent;
3772	      if (!node)
3773	        return dup_root;
3774	    }
3775	  node = node->right;
3776	  p_new = &dup_node->right;
3777	}
3778    }
3779}
3780