1146040Stjr/* Extended regular expression matching and search library.
2146040Stjr   Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3146040Stjr   This file is part of the GNU C Library.
4146040Stjr   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5146040Stjr
6146040Stjr   The GNU C Library is free software; you can redistribute it and/or
7146040Stjr   modify it under the terms of the GNU Lesser General Public
8146040Stjr   License as published by the Free Software Foundation; either
9146040Stjr   version 2.1 of the License, or (at your option) any later version.
10146040Stjr
11146040Stjr   The GNU C Library is distributed in the hope that it will be useful,
12146040Stjr   but WITHOUT ANY WARRANTY; without even the implied warranty of
13146040Stjr   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14146040Stjr   Lesser General Public License for more details.
15146040Stjr
16146040Stjr   You should have received a copy of the GNU Lesser General Public
17146040Stjr   License along with the GNU C Library; if not, write to the Free
18146040Stjr   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19146040Stjr   02111-1307 USA.  */
20146040Stjr
21146040Stjrstatic reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22146040Stjr					  int length, reg_syntax_t syntax);
23146040Stjrstatic void re_compile_fastmap_iter (regex_t *bufp,
24146040Stjr				     const re_dfastate_t *init_state,
25146040Stjr				     char *fastmap);
26146040Stjrstatic reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27146040Stjrstatic void init_word_char (re_dfa_t *dfa);
28146040Stjr#ifdef RE_ENABLE_I18N
29146040Stjrstatic void free_charset (re_charset_t *cset);
30146040Stjr#endif /* RE_ENABLE_I18N */
31146040Stjrstatic void free_workarea_compile (regex_t *preg);
32146040Stjrstatic reg_errcode_t create_initial_state (re_dfa_t *dfa);
33146040Stjr#ifdef RE_ENABLE_I18N
34146040Stjrstatic void optimize_utf8 (re_dfa_t *dfa);
35146040Stjr#endif
36146040Stjrstatic reg_errcode_t analyze (regex_t *preg);
37146040Stjrstatic reg_errcode_t create_initial_state (re_dfa_t *dfa);
38146040Stjrstatic reg_errcode_t preorder (bin_tree_t *root,
39146040Stjr			       reg_errcode_t (fn (void *, bin_tree_t *)),
40146040Stjr			       void *extra);
41146040Stjrstatic reg_errcode_t postorder (bin_tree_t *root,
42146040Stjr				reg_errcode_t (fn (void *, bin_tree_t *)),
43146040Stjr				void *extra);
44146040Stjrstatic reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45146040Stjrstatic reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46146040Stjrstatic bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47146040Stjr				 bin_tree_t *node);
48146040Stjrstatic reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49146040Stjrstatic reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50146040Stjrstatic reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51146040Stjrstatic reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
52146040Stjr					     int top_clone_node, int root_node,
53146040Stjr					     unsigned int constraint);
54146040Stjrstatic reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
55146040Stjr				     unsigned int constraint);
56146040Stjrstatic int search_duplicated_node (re_dfa_t *dfa, int org_node,
57146040Stjr				   unsigned int constraint);
58146040Stjrstatic reg_errcode_t calc_eclosure (re_dfa_t *dfa);
59146040Stjrstatic reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
60146040Stjr					 int node, int root);
61146040Stjrstatic reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
62146040Stjrstatic int fetch_number (re_string_t *input, re_token_t *token,
63146040Stjr			 reg_syntax_t syntax);
64146040Stjrstatic void fetch_token (re_token_t *result, re_string_t *input,
65146040Stjr			 reg_syntax_t syntax);
66146040Stjrstatic int peek_token (re_token_t *token, re_string_t *input,
67146040Stjr			reg_syntax_t syntax);
68146040Stjrstatic int peek_token_bracket (re_token_t *token, re_string_t *input,
69146040Stjr			       reg_syntax_t syntax);
70146040Stjrstatic bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
71146040Stjr			  reg_syntax_t syntax, reg_errcode_t *err);
72146040Stjrstatic bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
73146040Stjr				  re_token_t *token, reg_syntax_t syntax,
74146040Stjr				  int nest, reg_errcode_t *err);
75146040Stjrstatic bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
76146040Stjr				 re_token_t *token, reg_syntax_t syntax,
77146040Stjr				 int nest, reg_errcode_t *err);
78146040Stjrstatic bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
79146040Stjr				     re_token_t *token, reg_syntax_t syntax,
80146040Stjr				     int nest, reg_errcode_t *err);
81146040Stjrstatic bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
82146040Stjr				  re_token_t *token, reg_syntax_t syntax,
83146040Stjr				  int nest, reg_errcode_t *err);
84146040Stjrstatic bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
85146040Stjr				 re_dfa_t *dfa, re_token_t *token,
86146040Stjr				 reg_syntax_t syntax, reg_errcode_t *err);
87146040Stjrstatic bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
88146040Stjr				      re_token_t *token, reg_syntax_t syntax,
89146040Stjr				      reg_errcode_t *err);
90146040Stjrstatic reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
91146040Stjr					    re_string_t *regexp,
92146040Stjr					    re_token_t *token, int token_len,
93146040Stjr					    re_dfa_t *dfa,
94146040Stjr					    reg_syntax_t syntax,
95146040Stjr					    int accept_hyphen);
96146040Stjrstatic reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97146040Stjr					  re_string_t *regexp,
98146040Stjr					  re_token_t *token);
99146040Stjr#ifndef _LIBC
100146040Stjr# ifdef RE_ENABLE_I18N
101146040Stjrstatic reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
102146040Stjr				      re_charset_t *mbcset, int *range_alloc,
103146040Stjr				      bracket_elem_t *start_elem,
104146040Stjr				      bracket_elem_t *end_elem);
105146040Stjrstatic reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
106146040Stjr					     re_charset_t *mbcset,
107146040Stjr					     int *coll_sym_alloc,
108146040Stjr					     const unsigned char *name);
109146040Stjr# else /* not RE_ENABLE_I18N */
110146040Stjrstatic reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
111146040Stjr				      bracket_elem_t *start_elem,
112146040Stjr				      bracket_elem_t *end_elem);
113146040Stjrstatic reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
114146040Stjr					     const unsigned char *name);
115146040Stjr# endif /* not RE_ENABLE_I18N */
116146040Stjr#endif /* not _LIBC */
117146040Stjr#ifdef RE_ENABLE_I18N
118146040Stjrstatic reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119146040Stjr					re_charset_t *mbcset,
120146040Stjr					int *equiv_class_alloc,
121146040Stjr					const unsigned char *name);
122146040Stjrstatic reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
123146040Stjr				      re_bitset_ptr_t sbcset,
124146040Stjr				      re_charset_t *mbcset,
125146040Stjr				      int *char_class_alloc,
126146040Stjr				      const unsigned char *class_name,
127146040Stjr				      reg_syntax_t syntax);
128146040Stjr#else  /* not RE_ENABLE_I18N */
129146040Stjrstatic reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
130146040Stjr					const unsigned char *name);
131146040Stjrstatic reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
132146040Stjr				      re_bitset_ptr_t sbcset,
133146040Stjr				      const unsigned char *class_name,
134146040Stjr				      reg_syntax_t syntax);
135146040Stjr#endif /* not RE_ENABLE_I18N */
136146040Stjrstatic bin_tree_t *build_charclass_op (re_dfa_t *dfa,
137146040Stjr				       unsigned RE_TRANSLATE_TYPE trans,
138146040Stjr				       const unsigned char *class_name,
139146040Stjr				       const unsigned char *extra,
140146040Stjr				       int non_match, reg_errcode_t *err);
141146040Stjrstatic bin_tree_t *create_tree (re_dfa_t *dfa,
142146040Stjr				bin_tree_t *left, bin_tree_t *right,
143146040Stjr				re_token_type_t type);
144146040Stjrstatic bin_tree_t *create_token_tree (re_dfa_t *dfa,
145146040Stjr				      bin_tree_t *left, bin_tree_t *right,
146146040Stjr				      const re_token_t *token);
147146040Stjrstatic bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
148146040Stjrstatic void free_token (re_token_t *node);
149146040Stjrstatic reg_errcode_t free_tree (void *extra, bin_tree_t *node);
150146040Stjrstatic reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
151146040Stjr
152146040Stjr/* This table gives an error message for each of the error codes listed
153146040Stjr   in regex.h.  Obviously the order here has to be same as there.
154146040Stjr   POSIX doesn't require that we do anything for REG_NOERROR,
155146040Stjr   but why not be nice?  */
156146040Stjr
157146040Stjrconst char __re_error_msgid[] attribute_hidden =
158146040Stjr  {
159146040Stjr#define REG_NOERROR_IDX	0
160146040Stjr    gettext_noop ("Success")	/* REG_NOERROR */
161146040Stjr    "\0"
162146040Stjr#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
163146040Stjr    gettext_noop ("No match")	/* REG_NOMATCH */
164146040Stjr    "\0"
165146040Stjr#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
166146040Stjr    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
167146040Stjr    "\0"
168146040Stjr#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
169146040Stjr    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
170146040Stjr    "\0"
171146040Stjr#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
172146040Stjr    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
173146040Stjr    "\0"
174146040Stjr#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
175146040Stjr    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
176146040Stjr    "\0"
177146040Stjr#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
178146040Stjr    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
179146040Stjr    "\0"
180146040Stjr#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
181146040Stjr    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
182146040Stjr    "\0"
183146040Stjr#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
184146040Stjr    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
185146040Stjr    "\0"
186146040Stjr#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
187146040Stjr    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
188146040Stjr    "\0"
189146040Stjr#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
190146040Stjr    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
191146040Stjr    "\0"
192146040Stjr#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
193146040Stjr    gettext_noop ("Invalid range end")	/* REG_ERANGE */
194146040Stjr    "\0"
195146040Stjr#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
196146040Stjr    gettext_noop ("Memory exhausted") /* REG_ESPACE */
197146040Stjr    "\0"
198146040Stjr#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
199146040Stjr    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
200146040Stjr    "\0"
201146040Stjr#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
202146040Stjr    gettext_noop ("Premature end of regular expression") /* REG_EEND */
203146040Stjr    "\0"
204146040Stjr#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
205146040Stjr    gettext_noop ("Regular expression too big") /* REG_ESIZE */
206146040Stjr    "\0"
207146040Stjr#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
208146040Stjr    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209146040Stjr  };
210146040Stjr
211146040Stjrconst size_t __re_error_msgid_idx[] attribute_hidden =
212146040Stjr  {
213146040Stjr    REG_NOERROR_IDX,
214146040Stjr    REG_NOMATCH_IDX,
215146040Stjr    REG_BADPAT_IDX,
216146040Stjr    REG_ECOLLATE_IDX,
217146040Stjr    REG_ECTYPE_IDX,
218146040Stjr    REG_EESCAPE_IDX,
219146040Stjr    REG_ESUBREG_IDX,
220146040Stjr    REG_EBRACK_IDX,
221146040Stjr    REG_EPAREN_IDX,
222146040Stjr    REG_EBRACE_IDX,
223146040Stjr    REG_BADBR_IDX,
224146040Stjr    REG_ERANGE_IDX,
225146040Stjr    REG_ESPACE_IDX,
226146040Stjr    REG_BADRPT_IDX,
227146040Stjr    REG_EEND_IDX,
228146040Stjr    REG_ESIZE_IDX,
229146040Stjr    REG_ERPAREN_IDX
230146040Stjr  };
231146040Stjr
232146040Stjr/* Entry points for GNU code.  */
233146040Stjr
234146040Stjr/* re_compile_pattern is the GNU regular expression compiler: it
235146040Stjr   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
236146040Stjr   Returns 0 if the pattern was valid, otherwise an error string.
237146040Stjr
238146040Stjr   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
239146040Stjr   are set in BUFP on entry.  */
240146040Stjr
241146040Stjrconst char *
242146040Stjrre_compile_pattern (pattern, length, bufp)
243146040Stjr    const char *pattern;
244146040Stjr    size_t length;
245146040Stjr    struct re_pattern_buffer *bufp;
246146040Stjr{
247146040Stjr  reg_errcode_t ret;
248146040Stjr
249146040Stjr  /* And GNU code determines whether or not to get register information
250146040Stjr     by passing null for the REGS argument to re_match, etc., not by
251146040Stjr     setting no_sub, unless RE_NO_SUB is set.  */
252146040Stjr  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
253146040Stjr
254146040Stjr  /* Match anchors at newline.  */
255146040Stjr  bufp->newline_anchor = 1;
256146040Stjr
257146040Stjr  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
258146040Stjr
259146040Stjr  if (!ret)
260146040Stjr    return NULL;
261146040Stjr  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
262146040Stjr}
263146040Stjr#ifdef _LIBC
264146040Stjrweak_alias (__re_compile_pattern, re_compile_pattern)
265146040Stjr#endif
266146040Stjr
267146040Stjr/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
268146040Stjr   also be assigned to arbitrarily: each pattern buffer stores its own
269146040Stjr   syntax, so it can be changed between regex compilations.  */
270146040Stjr/* This has no initializer because initialized variables in Emacs
271146040Stjr   become read-only after dumping.  */
272146040Stjrreg_syntax_t re_syntax_options;
273146040Stjr
274146040Stjr
275146040Stjr/* Specify the precise syntax of regexps for compilation.  This provides
276146040Stjr   for compatibility for various utilities which historically have
277146040Stjr   different, incompatible syntaxes.
278146040Stjr
279146040Stjr   The argument SYNTAX is a bit mask comprised of the various bits
280146040Stjr   defined in regex.h.  We return the old syntax.  */
281146040Stjr
282146040Stjrreg_syntax_t
283146040Stjrre_set_syntax (syntax)
284146040Stjr    reg_syntax_t syntax;
285146040Stjr{
286146040Stjr  reg_syntax_t ret = re_syntax_options;
287146040Stjr
288146040Stjr  re_syntax_options = syntax;
289146040Stjr  return ret;
290146040Stjr}
291146040Stjr#ifdef _LIBC
292146040Stjrweak_alias (__re_set_syntax, re_set_syntax)
293146040Stjr#endif
294146040Stjr
295146040Stjrint
296146040Stjrre_compile_fastmap (bufp)
297146040Stjr    struct re_pattern_buffer *bufp;
298146040Stjr{
299146040Stjr  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
300146040Stjr  char *fastmap = bufp->fastmap;
301146040Stjr
302146040Stjr  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
303146040Stjr  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
304146040Stjr  if (dfa->init_state != dfa->init_state_word)
305146040Stjr    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
306146040Stjr  if (dfa->init_state != dfa->init_state_nl)
307146040Stjr    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
308146040Stjr  if (dfa->init_state != dfa->init_state_begbuf)
309146040Stjr    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
310146040Stjr  bufp->fastmap_accurate = 1;
311146040Stjr  return 0;
312146040Stjr}
313146040Stjr#ifdef _LIBC
314146040Stjrweak_alias (__re_compile_fastmap, re_compile_fastmap)
315146040Stjr#endif
316146040Stjr
317146040Stjrstatic inline void
318146040Stjr__attribute ((always_inline))
319146040Stjrre_set_fastmap (char *fastmap, int icase, int ch)
320146040Stjr{
321146040Stjr  fastmap[ch] = 1;
322146040Stjr  if (icase)
323146040Stjr    fastmap[tolower (ch)] = 1;
324146040Stjr}
325146040Stjr
326146040Stjr/* Helper function for re_compile_fastmap.
327146040Stjr   Compile fastmap for the initial_state INIT_STATE.  */
328146040Stjr
329146040Stjrstatic void
330146040Stjrre_compile_fastmap_iter (bufp, init_state, fastmap)
331146040Stjr     regex_t *bufp;
332146040Stjr     const re_dfastate_t *init_state;
333146040Stjr     char *fastmap;
334146040Stjr{
335146040Stjr  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
336146040Stjr  int node_cnt;
337146040Stjr  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
338146040Stjr  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
339146040Stjr    {
340146040Stjr      int node = init_state->nodes.elems[node_cnt];
341146040Stjr      re_token_type_t type = dfa->nodes[node].type;
342146040Stjr
343146040Stjr      if (type == CHARACTER)
344146040Stjr	{
345146040Stjr	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
346146040Stjr#ifdef RE_ENABLE_I18N
347146040Stjr	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
348146040Stjr	    {
349146040Stjr	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
350146040Stjr	      wchar_t wc;
351146040Stjr	      mbstate_t state;
352146040Stjr
353146040Stjr	      p = buf;
354146040Stjr	      *p++ = dfa->nodes[node].opr.c;
355146040Stjr	      while (++node < dfa->nodes_len
356146040Stjr		     &&	dfa->nodes[node].type == CHARACTER
357146040Stjr		     && dfa->nodes[node].mb_partial)
358146040Stjr		*p++ = dfa->nodes[node].opr.c;
359146040Stjr	      memset (&state, 0, sizeof (state));
360146040Stjr	      if (mbrtowc (&wc, (const char *) buf, p - buf,
361146040Stjr			   &state) == p - buf
362146040Stjr		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
363146040Stjr		      != (size_t) -1))
364146040Stjr		re_set_fastmap (fastmap, 0, buf[0]);
365146040Stjr	    }
366146040Stjr#endif
367146040Stjr	}
368146040Stjr      else if (type == SIMPLE_BRACKET)
369146040Stjr	{
370146040Stjr	  int i, j, ch;
371146040Stjr	  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
372146040Stjr	    for (j = 0; j < UINT_BITS; ++j, ++ch)
373146040Stjr	      if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
374146040Stjr		re_set_fastmap (fastmap, icase, ch);
375146040Stjr	}
376146040Stjr#ifdef RE_ENABLE_I18N
377146040Stjr      else if (type == COMPLEX_BRACKET)
378146040Stjr	{
379146040Stjr	  int i;
380146040Stjr	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
381146040Stjr	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
382146040Stjr	      || cset->nranges || cset->nchar_classes)
383146040Stjr	    {
384146040Stjr# ifdef _LIBC
385146040Stjr	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
386146040Stjr		{
387146040Stjr		  /* In this case we want to catch the bytes which are
388146040Stjr		     the first byte of any collation elements.
389146040Stjr		     e.g. In da_DK, we want to catch 'a' since "aa"
390146040Stjr			  is a valid collation element, and don't catch
391146040Stjr			  'b' since 'b' is the only collation element
392146040Stjr			  which starts from 'b'.  */
393146040Stjr		  int j, ch;
394146040Stjr		  const int32_t *table = (const int32_t *)
395146040Stjr		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
396146040Stjr		  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
397146040Stjr		    for (j = 0; j < UINT_BITS; ++j, ++ch)
398146040Stjr		      if (table[ch] < 0)
399146040Stjr			re_set_fastmap (fastmap, icase, ch);
400146040Stjr		}
401146040Stjr# else
402146040Stjr	      if (dfa->mb_cur_max > 1)
403146040Stjr		for (i = 0; i < SBC_MAX; ++i)
404146040Stjr		  if (__btowc (i) == WEOF)
405146040Stjr		    re_set_fastmap (fastmap, icase, i);
406146040Stjr# endif /* not _LIBC */
407146040Stjr	    }
408146040Stjr	  for (i = 0; i < cset->nmbchars; ++i)
409146040Stjr	    {
410146040Stjr	      char buf[256];
411146040Stjr	      mbstate_t state;
412146040Stjr	      memset (&state, '\0', sizeof (state));
413146040Stjr	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
414146040Stjr		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
415146040Stjr	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
416146040Stjr		{
417146040Stjr		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
418146040Stjr		      != (size_t) -1)
419146040Stjr		    re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
420146040Stjr		}
421146040Stjr	    }
422146040Stjr	}
423146040Stjr#endif /* RE_ENABLE_I18N */
424146040Stjr      else if (type == OP_PERIOD
425146040Stjr#ifdef RE_ENABLE_I18N
426146040Stjr	       || type == OP_UTF8_PERIOD
427146040Stjr#endif /* RE_ENABLE_I18N */
428146040Stjr	       || type == END_OF_RE)
429146040Stjr	{
430146040Stjr	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
431146040Stjr	  if (type == END_OF_RE)
432146040Stjr	    bufp->can_be_null = 1;
433146040Stjr	  return;
434146040Stjr	}
435146040Stjr    }
436146040Stjr}
437146040Stjr
438146040Stjr/* Entry point for POSIX code.  */
439146040Stjr/* regcomp takes a regular expression as a string and compiles it.
440146040Stjr
441146040Stjr   PREG is a regex_t *.  We do not expect any fields to be initialized,
442146040Stjr   since POSIX says we shouldn't.  Thus, we set
443146040Stjr
444146040Stjr     `buffer' to the compiled pattern;
445146040Stjr     `used' to the length of the compiled pattern;
446146040Stjr     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
447146040Stjr       REG_EXTENDED bit in CFLAGS is set; otherwise, to
448146040Stjr       RE_SYNTAX_POSIX_BASIC;
449146040Stjr     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
450146040Stjr     `fastmap' to an allocated space for the fastmap;
451146040Stjr     `fastmap_accurate' to zero;
452146040Stjr     `re_nsub' to the number of subexpressions in PATTERN.
453146040Stjr
454146040Stjr   PATTERN is the address of the pattern string.
455146040Stjr
456146040Stjr   CFLAGS is a series of bits which affect compilation.
457146040Stjr
458146040Stjr     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
459146040Stjr     use POSIX basic syntax.
460146040Stjr
461146040Stjr     If REG_NEWLINE is set, then . and [^...] don't match newline.
462146040Stjr     Also, regexec will try a match beginning after every newline.
463146040Stjr
464146040Stjr     If REG_ICASE is set, then we considers upper- and lowercase
465146040Stjr     versions of letters to be equivalent when matching.
466146040Stjr
467146040Stjr     If REG_NOSUB is set, then when PREG is passed to regexec, that
468146040Stjr     routine will report only success or failure, and nothing about the
469146040Stjr     registers.
470146040Stjr
471146040Stjr   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
472146040Stjr   the return codes and their meanings.)  */
473146040Stjr
474146040Stjrint
475146040Stjrregcomp (preg, pattern, cflags)
476146040Stjr    regex_t *__restrict preg;
477146040Stjr    const char *__restrict pattern;
478146040Stjr    int cflags;
479146040Stjr{
480146040Stjr  reg_errcode_t ret;
481146040Stjr  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
482146040Stjr			 : RE_SYNTAX_POSIX_BASIC);
483146040Stjr
484146040Stjr  preg->buffer = NULL;
485146040Stjr  preg->allocated = 0;
486146040Stjr  preg->used = 0;
487146040Stjr
488146040Stjr  /* Try to allocate space for the fastmap.  */
489146040Stjr  preg->fastmap = re_malloc (char, SBC_MAX);
490146040Stjr  if (BE (preg->fastmap == NULL, 0))
491146040Stjr    return REG_ESPACE;
492146040Stjr
493146040Stjr  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494146040Stjr
495146040Stjr  /* If REG_NEWLINE is set, newlines are treated differently.  */
496146040Stjr  if (cflags & REG_NEWLINE)
497146040Stjr    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
498146040Stjr      syntax &= ~RE_DOT_NEWLINE;
499146040Stjr      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
500146040Stjr      /* It also changes the matching behavior.  */
501146040Stjr      preg->newline_anchor = 1;
502146040Stjr    }
503146040Stjr  else
504146040Stjr    preg->newline_anchor = 0;
505146040Stjr  preg->no_sub = !!(cflags & REG_NOSUB);
506146040Stjr  preg->translate = NULL;
507146040Stjr
508146040Stjr  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509146040Stjr
510146040Stjr  /* POSIX doesn't distinguish between an unmatched open-group and an
511146040Stjr     unmatched close-group: both are REG_EPAREN.  */
512146040Stjr  if (ret == REG_ERPAREN)
513146040Stjr    ret = REG_EPAREN;
514146040Stjr
515146040Stjr  /* We have already checked preg->fastmap != NULL.  */
516146040Stjr  if (BE (ret == REG_NOERROR, 1))
517146040Stjr    /* Compute the fastmap now, since regexec cannot modify the pattern
518146040Stjr       buffer.  This function never fails in this implementation.  */
519146040Stjr    (void) re_compile_fastmap (preg);
520146040Stjr  else
521146040Stjr    {
522146040Stjr      /* Some error occurred while compiling the expression.  */
523146040Stjr      re_free (preg->fastmap);
524146040Stjr      preg->fastmap = NULL;
525146040Stjr    }
526146040Stjr
527146040Stjr  return (int) ret;
528146040Stjr}
529146040Stjr#ifdef _LIBC
530146040Stjrweak_alias (__regcomp, regcomp)
531146040Stjr#endif
532146040Stjr
533146040Stjr/* Returns a message corresponding to an error code, ERRCODE, returned
534146040Stjr   from either regcomp or regexec.   We don't use PREG here.  */
535146040Stjr
536146040Stjrsize_t
537146040Stjrregerror (errcode, preg, errbuf, errbuf_size)
538146040Stjr    int errcode;
539146040Stjr    const regex_t *preg;
540146040Stjr    char *errbuf;
541146040Stjr    size_t errbuf_size;
542146040Stjr{
543146040Stjr  const char *msg;
544146040Stjr  size_t msg_size;
545146040Stjr
546146040Stjr  if (BE (errcode < 0
547146040Stjr	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
548146040Stjr			       / sizeof (__re_error_msgid_idx[0])), 0))
549146040Stjr    /* Only error codes returned by the rest of the code should be passed
550146040Stjr       to this routine.  If we are given anything else, or if other regex
551146040Stjr       code generates an invalid error code, then the program has a bug.
552146040Stjr       Dump core so we can fix it.  */
553146040Stjr    abort ();
554146040Stjr
555146040Stjr  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
556146040Stjr
557146040Stjr  msg_size = strlen (msg) + 1; /* Includes the null.  */
558146040Stjr
559146040Stjr  if (BE (errbuf_size != 0, 1))
560146040Stjr    {
561146040Stjr      if (BE (msg_size > errbuf_size, 0))
562146040Stjr	{
563146040Stjr#if defined HAVE_MEMPCPY || defined _LIBC
564146040Stjr	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
565146040Stjr#else
566146040Stjr	  memcpy (errbuf, msg, errbuf_size - 1);
567146040Stjr	  errbuf[errbuf_size - 1] = 0;
568146040Stjr#endif
569146040Stjr	}
570146040Stjr      else
571146040Stjr	memcpy (errbuf, msg, msg_size);
572146040Stjr    }
573146040Stjr
574146040Stjr  return msg_size;
575146040Stjr}
576146040Stjr#ifdef _LIBC
577146040Stjrweak_alias (__regerror, regerror)
578146040Stjr#endif
579146040Stjr
580146040Stjr
581146040Stjr#ifdef RE_ENABLE_I18N
582146040Stjr/* This static array is used for the map to single-byte characters when
583146040Stjr   UTF-8 is used.  Otherwise we would allocate memory just to initialize
584146040Stjr   it the same all the time.  UTF-8 is the preferred encoding so this is
585146040Stjr   a worthwhile optimization.  */
586146040Stjrstatic const bitset utf8_sb_map =
587146040Stjr{
588146040Stjr  /* Set the first 128 bits.  */
589146040Stjr# if UINT_MAX == 0xffffffff
590146040Stjr  0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
591146040Stjr# else
592146040Stjr#  error "Add case for new unsigned int size"
593146040Stjr# endif
594146040Stjr};
595146040Stjr#endif
596146040Stjr
597146040Stjr
598146040Stjrstatic void
599146040Stjrfree_dfa_content (re_dfa_t *dfa)
600146040Stjr{
601146040Stjr  int i, j;
602146040Stjr
603146040Stjr  if (dfa->nodes)
604146040Stjr    for (i = 0; i < dfa->nodes_len; ++i)
605146040Stjr      free_token (dfa->nodes + i);
606146040Stjr  re_free (dfa->nexts);
607146040Stjr  for (i = 0; i < dfa->nodes_len; ++i)
608146040Stjr    {
609146040Stjr      if (dfa->eclosures != NULL)
610146040Stjr	re_node_set_free (dfa->eclosures + i);
611146040Stjr      if (dfa->inveclosures != NULL)
612146040Stjr	re_node_set_free (dfa->inveclosures + i);
613146040Stjr      if (dfa->edests != NULL)
614146040Stjr	re_node_set_free (dfa->edests + i);
615146040Stjr    }
616146040Stjr  re_free (dfa->edests);
617146040Stjr  re_free (dfa->eclosures);
618146040Stjr  re_free (dfa->inveclosures);
619146040Stjr  re_free (dfa->nodes);
620146040Stjr
621146040Stjr  if (dfa->state_table)
622146040Stjr    for (i = 0; i <= dfa->state_hash_mask; ++i)
623146040Stjr      {
624146040Stjr	struct re_state_table_entry *entry = dfa->state_table + i;
625146040Stjr	for (j = 0; j < entry->num; ++j)
626146040Stjr	  {
627146040Stjr	    re_dfastate_t *state = entry->array[j];
628146040Stjr	    free_state (state);
629146040Stjr	  }
630146040Stjr        re_free (entry->array);
631146040Stjr      }
632146040Stjr  re_free (dfa->state_table);
633146040Stjr#ifdef RE_ENABLE_I18N
634146040Stjr  if (dfa->sb_char != utf8_sb_map)
635146040Stjr    re_free (dfa->sb_char);
636146040Stjr#endif
637146040Stjr  re_free (dfa->subexp_map);
638146040Stjr#ifdef DEBUG
639146040Stjr  re_free (dfa->re_str);
640146040Stjr#endif
641146040Stjr
642146040Stjr  re_free (dfa);
643146040Stjr}
644146040Stjr
645146040Stjr
646146040Stjr/* Free dynamically allocated space used by PREG.  */
647146040Stjr
648146040Stjrvoid
649146040Stjrregfree (preg)
650146040Stjr    regex_t *preg;
651146040Stjr{
652146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
653146040Stjr  if (BE (dfa != NULL, 1))
654146040Stjr    free_dfa_content (dfa);
655146040Stjr  preg->buffer = NULL;
656146040Stjr  preg->allocated = 0;
657146040Stjr
658146040Stjr  re_free (preg->fastmap);
659146040Stjr  preg->fastmap = NULL;
660146040Stjr
661146040Stjr  re_free (preg->translate);
662146040Stjr  preg->translate = NULL;
663146040Stjr}
664146040Stjr#ifdef _LIBC
665146040Stjrweak_alias (__regfree, regfree)
666146040Stjr#endif
667146040Stjr
668146040Stjr/* Entry points compatible with 4.2 BSD regex library.  We don't define
669146040Stjr   them unless specifically requested.  */
670146040Stjr
671146040Stjr#if defined _REGEX_RE_COMP || defined _LIBC
672146040Stjr
673146040Stjr/* BSD has one and only one pattern buffer.  */
674146040Stjrstatic struct re_pattern_buffer re_comp_buf;
675146040Stjr
676146040Stjrchar *
677146040Stjr# ifdef _LIBC
678146040Stjr/* Make these definitions weak in libc, so POSIX programs can redefine
679146040Stjr   these names if they don't use our functions, and still use
680146040Stjr   regcomp/regexec above without link errors.  */
681146040Stjrweak_function
682146040Stjr# endif
683146040Stjrre_comp (s)
684146040Stjr     const char *s;
685146040Stjr{
686146040Stjr  reg_errcode_t ret;
687146040Stjr  char *fastmap;
688146040Stjr
689146040Stjr  if (!s)
690146040Stjr    {
691146040Stjr      if (!re_comp_buf.buffer)
692146040Stjr	return gettext ("No previous regular expression");
693146040Stjr      return 0;
694146040Stjr    }
695146040Stjr
696146040Stjr  if (re_comp_buf.buffer)
697146040Stjr    {
698146040Stjr      fastmap = re_comp_buf.fastmap;
699146040Stjr      re_comp_buf.fastmap = NULL;
700146040Stjr      __regfree (&re_comp_buf);
701146040Stjr      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
702146040Stjr      re_comp_buf.fastmap = fastmap;
703146040Stjr    }
704146040Stjr
705146040Stjr  if (re_comp_buf.fastmap == NULL)
706146040Stjr    {
707146040Stjr      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
708146040Stjr      if (re_comp_buf.fastmap == NULL)
709146040Stjr	return (char *) gettext (__re_error_msgid
710146040Stjr				 + __re_error_msgid_idx[(int) REG_ESPACE]);
711146040Stjr    }
712146040Stjr
713146040Stjr  /* Since `re_exec' always passes NULL for the `regs' argument, we
714146040Stjr     don't need to initialize the pattern buffer fields which affect it.  */
715146040Stjr
716146040Stjr  /* Match anchors at newlines.  */
717146040Stjr  re_comp_buf.newline_anchor = 1;
718146040Stjr
719146040Stjr  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
720146040Stjr
721146040Stjr  if (!ret)
722146040Stjr    return NULL;
723146040Stjr
724146040Stjr  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
725146040Stjr  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
726146040Stjr}
727146040Stjr
728146040Stjr#ifdef _LIBC
729146040Stjrlibc_freeres_fn (free_mem)
730146040Stjr{
731146040Stjr  __regfree (&re_comp_buf);
732146040Stjr}
733146040Stjr#endif
734146040Stjr
735146040Stjr#endif /* _REGEX_RE_COMP */
736146040Stjr
737146040Stjr/* Internal entry point.
738146040Stjr   Compile the regular expression PATTERN, whose length is LENGTH.
739146040Stjr   SYNTAX indicate regular expression's syntax.  */
740146040Stjr
741146040Stjrstatic reg_errcode_t
742146040Stjrre_compile_internal (preg, pattern, length, syntax)
743146040Stjr     regex_t *preg;
744146040Stjr     const char * pattern;
745146040Stjr     int length;
746146040Stjr     reg_syntax_t syntax;
747146040Stjr{
748146040Stjr  reg_errcode_t err = REG_NOERROR;
749146040Stjr  re_dfa_t *dfa;
750146040Stjr  re_string_t regexp;
751146040Stjr
752146040Stjr  /* Initialize the pattern buffer.  */
753146040Stjr  preg->fastmap_accurate = 0;
754146040Stjr  preg->syntax = syntax;
755146040Stjr  preg->not_bol = preg->not_eol = 0;
756146040Stjr  preg->used = 0;
757146040Stjr  preg->re_nsub = 0;
758146040Stjr  preg->can_be_null = 0;
759146040Stjr  preg->regs_allocated = REGS_UNALLOCATED;
760146040Stjr
761146040Stjr  /* Initialize the dfa.  */
762146040Stjr  dfa = (re_dfa_t *) preg->buffer;
763146040Stjr  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
764146040Stjr    {
765146040Stjr      /* If zero allocated, but buffer is non-null, try to realloc
766146040Stjr	 enough space.  This loses if buffer's address is bogus, but
767146040Stjr	 that is the user's responsibility.  If ->buffer is NULL this
768146040Stjr	 is a simple allocation.  */
769146040Stjr      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
770146040Stjr      if (dfa == NULL)
771146040Stjr	return REG_ESPACE;
772146040Stjr      preg->allocated = sizeof (re_dfa_t);
773146040Stjr      preg->buffer = (unsigned char *) dfa;
774146040Stjr    }
775146040Stjr  preg->used = sizeof (re_dfa_t);
776146040Stjr
777146040Stjr  err = init_dfa (dfa, length);
778146040Stjr  if (BE (err != REG_NOERROR, 0))
779146040Stjr    {
780146040Stjr      free_dfa_content (dfa);
781146040Stjr      preg->buffer = NULL;
782146040Stjr      preg->allocated = 0;
783146040Stjr      return err;
784146040Stjr    }
785146040Stjr#ifdef DEBUG
786146040Stjr  dfa->re_str = re_malloc (char, length + 1);
787146040Stjr  strncpy (dfa->re_str, pattern, length + 1);
788146040Stjr#endif
789146040Stjr
790146040Stjr  err = re_string_construct (&regexp, pattern, length, preg->translate,
791146040Stjr			     syntax & RE_ICASE, dfa);
792146040Stjr  if (BE (err != REG_NOERROR, 0))
793146040Stjr    {
794146040Stjr    re_compile_internal_free_return:
795146040Stjr      free_workarea_compile (preg);
796146040Stjr      re_string_destruct (&regexp);
797146040Stjr      free_dfa_content (dfa);
798146040Stjr      preg->buffer = NULL;
799146040Stjr      preg->allocated = 0;
800146040Stjr      return err;
801146040Stjr    }
802146040Stjr
803146040Stjr  /* Parse the regular expression, and build a structure tree.  */
804146040Stjr  preg->re_nsub = 0;
805146040Stjr  dfa->str_tree = parse (&regexp, preg, syntax, &err);
806146040Stjr  if (BE (dfa->str_tree == NULL, 0))
807146040Stjr    goto re_compile_internal_free_return;
808146040Stjr
809146040Stjr  /* Analyze the tree and create the nfa.  */
810146040Stjr  err = analyze (preg);
811146040Stjr  if (BE (err != REG_NOERROR, 0))
812146040Stjr    goto re_compile_internal_free_return;
813146040Stjr
814146040Stjr#ifdef RE_ENABLE_I18N
815146040Stjr  /* If possible, do searching in single byte encoding to speed things up.  */
816146040Stjr  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
817146040Stjr    optimize_utf8 (dfa);
818146040Stjr#endif
819146040Stjr
820146040Stjr  /* Then create the initial state of the dfa.  */
821146040Stjr  err = create_initial_state (dfa);
822146040Stjr
823146040Stjr  /* Release work areas.  */
824146040Stjr  free_workarea_compile (preg);
825146040Stjr  re_string_destruct (&regexp);
826146040Stjr
827146040Stjr  if (BE (err != REG_NOERROR, 0))
828146040Stjr    {
829146040Stjr      free_dfa_content (dfa);
830146040Stjr      preg->buffer = NULL;
831146040Stjr      preg->allocated = 0;
832146040Stjr    }
833146040Stjr
834146040Stjr  return err;
835146040Stjr}
836146040Stjr
837146040Stjr/* Initialize DFA.  We use the length of the regular expression PAT_LEN
838146040Stjr   as the initial length of some arrays.  */
839146040Stjr
840146040Stjrstatic reg_errcode_t
841146040Stjrinit_dfa (dfa, pat_len)
842146040Stjr     re_dfa_t *dfa;
843146040Stjr     int pat_len;
844146040Stjr{
845146040Stjr  int table_size;
846146040Stjr#ifndef _LIBC
847146040Stjr  char *codeset_name;
848146040Stjr#endif
849146040Stjr
850146040Stjr  memset (dfa, '\0', sizeof (re_dfa_t));
851146040Stjr
852146040Stjr  /* Force allocation of str_tree_storage the first time.  */
853146040Stjr  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854146040Stjr
855146040Stjr  dfa->nodes_alloc = pat_len + 1;
856146040Stjr  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
857146040Stjr
858146040Stjr  dfa->states_alloc = pat_len + 1;
859146040Stjr
860146040Stjr  /*  table_size = 2 ^ ceil(log pat_len) */
861146040Stjr  for (table_size = 1; table_size > 0; table_size <<= 1)
862146040Stjr    if (table_size > pat_len)
863146040Stjr      break;
864146040Stjr
865146040Stjr  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
866146040Stjr  dfa->state_hash_mask = table_size - 1;
867146040Stjr
868146040Stjr  dfa->mb_cur_max = MB_CUR_MAX;
869146040Stjr#ifdef _LIBC
870146040Stjr  if (dfa->mb_cur_max == 6
871146040Stjr      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
872146040Stjr    dfa->is_utf8 = 1;
873146040Stjr  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
874146040Stjr		       != 0);
875146040Stjr#else
876146040Stjr# ifdef HAVE_LANGINFO_CODESET
877146040Stjr  codeset_name = nl_langinfo (CODESET);
878146040Stjr# else
879146040Stjr  codeset_name = getenv ("LC_ALL");
880146040Stjr  if (codeset_name == NULL || codeset_name[0] == '\0')
881146040Stjr    codeset_name = getenv ("LC_CTYPE");
882146040Stjr  if (codeset_name == NULL || codeset_name[0] == '\0')
883146040Stjr    codeset_name = getenv ("LANG");
884146040Stjr  if (codeset_name == NULL)
885146040Stjr    codeset_name = "";
886146040Stjr  else if (strchr (codeset_name, '.') !=  NULL)
887146040Stjr    codeset_name = strchr (codeset_name, '.') + 1;
888146040Stjr# endif
889146040Stjr
890146040Stjr  if (strcasecmp (codeset_name, "UTF-8") == 0
891146040Stjr      || strcasecmp (codeset_name, "UTF8") == 0)
892146040Stjr    dfa->is_utf8 = 1;
893146040Stjr
894146040Stjr  /* We check exhaustively in the loop below if this charset is a
895146040Stjr     superset of ASCII.  */
896146040Stjr  dfa->map_notascii = 0;
897146040Stjr#endif
898146040Stjr
899146040Stjr#ifdef RE_ENABLE_I18N
900146040Stjr  if (dfa->mb_cur_max > 1)
901146040Stjr    {
902146040Stjr      if (dfa->is_utf8)
903146040Stjr	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
904146040Stjr      else
905146040Stjr	{
906146040Stjr	  int i, j, ch;
907146040Stjr
908146040Stjr	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
909146040Stjr	  if (BE (dfa->sb_char == NULL, 0))
910146040Stjr	    return REG_ESPACE;
911146040Stjr
912146040Stjr	  /* Clear all bits by, then set those corresponding to single
913146040Stjr	     byte chars.  */
914146040Stjr	  bitset_empty (dfa->sb_char);
915146040Stjr
916146040Stjr	  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
917146040Stjr	    for (j = 0; j < UINT_BITS; ++j, ++ch)
918146040Stjr	      {
919146040Stjr		wchar_t wch = __btowc (ch);
920146040Stjr		if (wch != WEOF)
921146040Stjr		  dfa->sb_char[i] |= 1 << j;
922146040Stjr# ifndef _LIBC
923146040Stjr		if (isascii (ch) && wch != (wchar_t) ch)
924146040Stjr		  dfa->map_notascii = 1;
925146040Stjr# endif
926146040Stjr	      }
927146040Stjr	}
928146040Stjr    }
929146040Stjr#endif
930146040Stjr
931146040Stjr  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
932146040Stjr    return REG_ESPACE;
933146040Stjr  return REG_NOERROR;
934146040Stjr}
935146040Stjr
936146040Stjr/* Initialize WORD_CHAR table, which indicate which character is
937146040Stjr   "word".  In this case "word" means that it is the word construction
938146040Stjr   character used by some operators like "\<", "\>", etc.  */
939146040Stjr
940146040Stjrstatic void
941146040Stjrinit_word_char (dfa)
942146040Stjr     re_dfa_t *dfa;
943146040Stjr{
944146040Stjr  int i, j, ch;
945146040Stjr  dfa->word_ops_used = 1;
946146040Stjr  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
947146040Stjr    for (j = 0; j < UINT_BITS; ++j, ++ch)
948146040Stjr      if (isalnum (ch) || ch == '_')
949146040Stjr	dfa->word_char[i] |= 1 << j;
950146040Stjr}
951146040Stjr
952146040Stjr/* Free the work area which are only used while compiling.  */
953146040Stjr
954146040Stjrstatic void
955146040Stjrfree_workarea_compile (preg)
956146040Stjr     regex_t *preg;
957146040Stjr{
958146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
959146040Stjr  bin_tree_storage_t *storage, *next;
960146040Stjr  for (storage = dfa->str_tree_storage; storage; storage = next)
961146040Stjr    {
962146040Stjr      next = storage->next;
963146040Stjr      re_free (storage);
964146040Stjr    }
965146040Stjr  dfa->str_tree_storage = NULL;
966146040Stjr  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
967146040Stjr  dfa->str_tree = NULL;
968146040Stjr  re_free (dfa->org_indices);
969146040Stjr  dfa->org_indices = NULL;
970146040Stjr}
971146040Stjr
972146040Stjr/* Create initial states for all contexts.  */
973146040Stjr
974146040Stjrstatic reg_errcode_t
975146040Stjrcreate_initial_state (dfa)
976146040Stjr     re_dfa_t *dfa;
977146040Stjr{
978146040Stjr  int first, i;
979146040Stjr  reg_errcode_t err;
980146040Stjr  re_node_set init_nodes;
981146040Stjr
982146040Stjr  /* Initial states have the epsilon closure of the node which is
983146040Stjr     the first node of the regular expression.  */
984146040Stjr  first = dfa->str_tree->first->node_idx;
985146040Stjr  dfa->init_node = first;
986146040Stjr  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
987146040Stjr  if (BE (err != REG_NOERROR, 0))
988146040Stjr    return err;
989146040Stjr
990146040Stjr  /* The back-references which are in initial states can epsilon transit,
991146040Stjr     since in this case all of the subexpressions can be null.
992146040Stjr     Then we add epsilon closures of the nodes which are the next nodes of
993146040Stjr     the back-references.  */
994146040Stjr  if (dfa->nbackref > 0)
995146040Stjr    for (i = 0; i < init_nodes.nelem; ++i)
996146040Stjr      {
997146040Stjr	int node_idx = init_nodes.elems[i];
998146040Stjr	re_token_type_t type = dfa->nodes[node_idx].type;
999146040Stjr
1000146040Stjr	int clexp_idx;
1001146040Stjr	if (type != OP_BACK_REF)
1002146040Stjr	  continue;
1003146040Stjr	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1004146040Stjr	  {
1005146040Stjr	    re_token_t *clexp_node;
1006146040Stjr	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1007146040Stjr	    if (clexp_node->type == OP_CLOSE_SUBEXP
1008146040Stjr		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1009146040Stjr	      break;
1010146040Stjr	  }
1011146040Stjr	if (clexp_idx == init_nodes.nelem)
1012146040Stjr	  continue;
1013146040Stjr
1014146040Stjr	if (type == OP_BACK_REF)
1015146040Stjr	  {
1016146040Stjr	    int dest_idx = dfa->edests[node_idx].elems[0];
1017146040Stjr	    if (!re_node_set_contains (&init_nodes, dest_idx))
1018146040Stjr	      {
1019146040Stjr		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1020146040Stjr		i = 0;
1021146040Stjr	      }
1022146040Stjr	  }
1023146040Stjr      }
1024146040Stjr
1025146040Stjr  /* It must be the first time to invoke acquire_state.  */
1026146040Stjr  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1027146040Stjr  /* We don't check ERR here, since the initial state must not be NULL.  */
1028146040Stjr  if (BE (dfa->init_state == NULL, 0))
1029146040Stjr    return err;
1030146040Stjr  if (dfa->init_state->has_constraint)
1031146040Stjr    {
1032146040Stjr      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1033146040Stjr						       CONTEXT_WORD);
1034146040Stjr      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1035146040Stjr						     CONTEXT_NEWLINE);
1036146040Stjr      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1037146040Stjr							 &init_nodes,
1038146040Stjr							 CONTEXT_NEWLINE
1039146040Stjr							 | CONTEXT_BEGBUF);
1040146040Stjr      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1041146040Stjr	      || dfa->init_state_begbuf == NULL, 0))
1042146040Stjr	return err;
1043146040Stjr    }
1044146040Stjr  else
1045146040Stjr    dfa->init_state_word = dfa->init_state_nl
1046146040Stjr      = dfa->init_state_begbuf = dfa->init_state;
1047146040Stjr
1048146040Stjr  re_node_set_free (&init_nodes);
1049146040Stjr  return REG_NOERROR;
1050146040Stjr}
1051146040Stjr
1052146040Stjr#ifdef RE_ENABLE_I18N
1053146040Stjr/* If it is possible to do searching in single byte encoding instead of UTF-8
1054146040Stjr   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1055146040Stjr   DFA nodes where needed.  */
1056146040Stjr
1057146040Stjrstatic void
1058146040Stjroptimize_utf8 (dfa)
1059146040Stjr     re_dfa_t *dfa;
1060146040Stjr{
1061146040Stjr  int node, i, mb_chars = 0, has_period = 0;
1062146040Stjr
1063146040Stjr  for (node = 0; node < dfa->nodes_len; ++node)
1064146040Stjr    switch (dfa->nodes[node].type)
1065146040Stjr      {
1066146040Stjr      case CHARACTER:
1067146040Stjr	if (dfa->nodes[node].opr.c >= 0x80)
1068146040Stjr	  mb_chars = 1;
1069146040Stjr	break;
1070146040Stjr      case ANCHOR:
1071146040Stjr	switch (dfa->nodes[node].opr.idx)
1072146040Stjr	  {
1073146040Stjr	  case LINE_FIRST:
1074146040Stjr	  case LINE_LAST:
1075146040Stjr	  case BUF_FIRST:
1076146040Stjr	  case BUF_LAST:
1077146040Stjr	    break;
1078146040Stjr	  default:
1079146040Stjr	    /* Word anchors etc. cannot be handled.  */
1080146040Stjr	    return;
1081146040Stjr	  }
1082146040Stjr	break;
1083146040Stjr      case OP_PERIOD:
1084146040Stjr        has_period = 1;
1085146040Stjr        break;
1086146040Stjr      case OP_BACK_REF:
1087146040Stjr      case OP_ALT:
1088146040Stjr      case END_OF_RE:
1089146040Stjr      case OP_DUP_ASTERISK:
1090146040Stjr      case OP_OPEN_SUBEXP:
1091146040Stjr      case OP_CLOSE_SUBEXP:
1092146040Stjr	break;
1093146040Stjr      case COMPLEX_BRACKET:
1094146040Stjr	return;
1095146040Stjr      case SIMPLE_BRACKET:
1096146040Stjr	/* Just double check.  */
1097146040Stjr        for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1098146040Stjr	  if (dfa->nodes[node].opr.sbcset[i])
1099146040Stjr	    return;
1100146040Stjr	break;
1101146040Stjr      default:
1102146040Stjr	abort ();
1103146040Stjr      }
1104146040Stjr
1105146040Stjr  if (mb_chars || has_period)
1106146040Stjr    for (node = 0; node < dfa->nodes_len; ++node)
1107146040Stjr      {
1108146040Stjr	if (dfa->nodes[node].type == CHARACTER
1109146040Stjr	    && dfa->nodes[node].opr.c >= 0x80)
1110146040Stjr	  dfa->nodes[node].mb_partial = 0;
1111146040Stjr	else if (dfa->nodes[node].type == OP_PERIOD)
1112146040Stjr	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1113146040Stjr      }
1114146040Stjr
1115146040Stjr  /* The search can be in single byte locale.  */
1116146040Stjr  dfa->mb_cur_max = 1;
1117146040Stjr  dfa->is_utf8 = 0;
1118146040Stjr  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119146040Stjr}
1120146040Stjr#endif
1121146040Stjr
1122146040Stjr/* Analyze the structure tree, and calculate "first", "next", "edest",
1123146040Stjr   "eclosure", and "inveclosure".  */
1124146040Stjr
1125146040Stjrstatic reg_errcode_t
1126146040Stjranalyze (preg)
1127146040Stjr     regex_t *preg;
1128146040Stjr{
1129146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1130146040Stjr  reg_errcode_t ret;
1131146040Stjr
1132146040Stjr  /* Allocate arrays.  */
1133146040Stjr  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1134146040Stjr  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1135146040Stjr  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1136146040Stjr  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1137146040Stjr  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1138146040Stjr	  || dfa->eclosures == NULL, 0))
1139146040Stjr    return REG_ESPACE;
1140146040Stjr
1141146040Stjr  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1142146040Stjr  if (dfa->subexp_map != NULL)
1143146040Stjr    {
1144146040Stjr      int i;
1145146040Stjr      for (i = 0; i < preg->re_nsub; i++)
1146146040Stjr	dfa->subexp_map[i] = i;
1147146040Stjr      preorder (dfa->str_tree, optimize_subexps, dfa);
1148146040Stjr      for (i = 0; i < preg->re_nsub; i++)
1149146040Stjr	if (dfa->subexp_map[i] != i)
1150146040Stjr	  break;
1151146040Stjr      if (i == preg->re_nsub)
1152146040Stjr	{
1153146040Stjr	  free (dfa->subexp_map);
1154146040Stjr	  dfa->subexp_map = NULL;
1155146040Stjr	}
1156146040Stjr    }
1157146040Stjr
1158146040Stjr  ret = postorder (dfa->str_tree, lower_subexps, preg);
1159146040Stjr  if (BE (ret != REG_NOERROR, 0))
1160146040Stjr    return ret;
1161146040Stjr  ret = postorder (dfa->str_tree, calc_first, dfa);
1162146040Stjr  if (BE (ret != REG_NOERROR, 0))
1163146040Stjr    return ret;
1164146040Stjr  preorder (dfa->str_tree, calc_next, dfa);
1165146040Stjr  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1166146040Stjr  if (BE (ret != REG_NOERROR, 0))
1167146040Stjr    return ret;
1168146040Stjr  ret = calc_eclosure (dfa);
1169146040Stjr  if (BE (ret != REG_NOERROR, 0))
1170146040Stjr    return ret;
1171146040Stjr
1172146040Stjr  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1173146040Stjr     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1174146040Stjr  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1175146040Stjr      || dfa->nbackref)
1176146040Stjr    {
1177146040Stjr      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1178146040Stjr      if (BE (dfa->inveclosures == NULL, 0))
1179146040Stjr        return REG_ESPACE;
1180146040Stjr      ret = calc_inveclosure (dfa);
1181146040Stjr    }
1182146040Stjr
1183146040Stjr  return ret;
1184146040Stjr}
1185146040Stjr
1186146040Stjr/* Our parse trees are very unbalanced, so we cannot use a stack to
1187146040Stjr   implement parse tree visits.  Instead, we use parent pointers and
1188146040Stjr   some hairy code in these two functions.  */
1189146040Stjrstatic reg_errcode_t
1190146040Stjrpostorder (root, fn, extra)
1191146040Stjr     bin_tree_t *root;
1192146040Stjr     reg_errcode_t (fn (void *, bin_tree_t *));
1193146040Stjr     void *extra;
1194146040Stjr{
1195146040Stjr  bin_tree_t *node, *prev;
1196146040Stjr
1197146040Stjr  for (node = root; ; )
1198146040Stjr    {
1199146040Stjr      /* Descend down the tree, preferably to the left (or to the right
1200146040Stjr	 if that's the only child).  */
1201146040Stjr      while (node->left || node->right)
1202146040Stjr	if (node->left)
1203146040Stjr          node = node->left;
1204146040Stjr        else
1205146040Stjr          node = node->right;
1206146040Stjr
1207146040Stjr      do
1208146040Stjr	{
1209146040Stjr	  reg_errcode_t err = fn (extra, node);
1210146040Stjr	  if (BE (err != REG_NOERROR, 0))
1211146040Stjr	    return err;
1212146040Stjr          if (node->parent == NULL)
1213146040Stjr	    return REG_NOERROR;
1214146040Stjr	  prev = node;
1215146040Stjr	  node = node->parent;
1216146040Stjr	}
1217146040Stjr      /* Go up while we have a node that is reached from the right.  */
1218146040Stjr      while (node->right == prev || node->right == NULL);
1219146040Stjr      node = node->right;
1220146040Stjr    }
1221146040Stjr}
1222146040Stjr
1223146040Stjrstatic reg_errcode_t
1224146040Stjrpreorder (root, fn, extra)
1225146040Stjr     bin_tree_t *root;
1226146040Stjr     reg_errcode_t (fn (void *, bin_tree_t *));
1227146040Stjr     void *extra;
1228146040Stjr{
1229146040Stjr  bin_tree_t *node;
1230146040Stjr
1231146040Stjr  for (node = root; ; )
1232146040Stjr    {
1233146040Stjr      reg_errcode_t err = fn (extra, node);
1234146040Stjr      if (BE (err != REG_NOERROR, 0))
1235146040Stjr	return err;
1236146040Stjr
1237146040Stjr      /* Go to the left node, or up and to the right.  */
1238146040Stjr      if (node->left)
1239146040Stjr	node = node->left;
1240146040Stjr      else
1241146040Stjr	{
1242146040Stjr	  bin_tree_t *prev = NULL;
1243146040Stjr	  while (node->right == prev || node->right == NULL)
1244146040Stjr	    {
1245146040Stjr	      prev = node;
1246146040Stjr	      node = node->parent;
1247146040Stjr	      if (!node)
1248146040Stjr	        return REG_NOERROR;
1249146040Stjr	    }
1250146040Stjr	  node = node->right;
1251146040Stjr	}
1252146040Stjr    }
1253146040Stjr}
1254146040Stjr
1255146040Stjr/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1256146040Stjr   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1257146040Stjr   backreferences as well.  Requires a preorder visit.  */
1258146040Stjrstatic reg_errcode_t
1259146040Stjroptimize_subexps (extra, node)
1260146040Stjr     void *extra;
1261146040Stjr     bin_tree_t *node;
1262146040Stjr{
1263146040Stjr  re_dfa_t *dfa = (re_dfa_t *) extra;
1264146040Stjr
1265146040Stjr  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1266146040Stjr    {
1267146040Stjr      int idx = node->token.opr.idx;
1268146040Stjr      node->token.opr.idx = dfa->subexp_map[idx];
1269146040Stjr      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1270146040Stjr    }
1271146040Stjr
1272146040Stjr  else if (node->token.type == SUBEXP
1273146040Stjr           && node->left && node->left->token.type == SUBEXP)
1274146040Stjr    {
1275146040Stjr      int other_idx = node->left->token.opr.idx;
1276146040Stjr
1277146040Stjr      node->left = node->left->left;
1278146040Stjr      if (node->left)
1279146040Stjr        node->left->parent = node;
1280146040Stjr
1281146040Stjr      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1282146040Stjr      if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1283146040Stjr	dfa->used_bkref_map &= ~(1 << other_idx);
1284146040Stjr    }
1285146040Stjr
1286146040Stjr  return REG_NOERROR;
1287146040Stjr}
1288146040Stjr
1289146040Stjr/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1290146040Stjr   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1291146040Stjrstatic reg_errcode_t
1292146040Stjrlower_subexps (extra, node)
1293146040Stjr     void *extra;
1294146040Stjr     bin_tree_t *node;
1295146040Stjr{
1296146040Stjr  regex_t *preg = (regex_t *) extra;
1297146040Stjr  reg_errcode_t err = REG_NOERROR;
1298146040Stjr
1299146040Stjr  if (node->left && node->left->token.type == SUBEXP)
1300146040Stjr    {
1301146040Stjr      node->left = lower_subexp (&err, preg, node->left);
1302146040Stjr      if (node->left)
1303146040Stjr	node->left->parent = node;
1304146040Stjr    }
1305146040Stjr  if (node->right && node->right->token.type == SUBEXP)
1306146040Stjr    {
1307146040Stjr      node->right = lower_subexp (&err, preg, node->right);
1308146040Stjr      if (node->right)
1309146040Stjr	node->right->parent = node;
1310146040Stjr    }
1311146040Stjr
1312146040Stjr  return err;
1313146040Stjr}
1314146040Stjr
1315146040Stjrstatic bin_tree_t *
1316146040Stjrlower_subexp (err, preg, node)
1317146040Stjr     reg_errcode_t *err;
1318146040Stjr     regex_t *preg;
1319146040Stjr     bin_tree_t *node;
1320146040Stjr{
1321146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1322146040Stjr  bin_tree_t *body = node->left;
1323146040Stjr  bin_tree_t *op, *cls, *tree1, *tree;
1324146040Stjr
1325146040Stjr  if (preg->no_sub
1326146040Stjr      /* We do not optimize empty subexpressions, because otherwise we may
1327146040Stjr	 have bad CONCAT nodes with NULL children.  This is obviously not
1328146040Stjr	 very common, so we do not lose much.  An example that triggers
1329146040Stjr	 this case is the sed "script" /\(\)/x.  */
1330146040Stjr      && node->left != NULL
1331146040Stjr      && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1332146040Stjr	  || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1333146040Stjr    return node->left;
1334146040Stjr
1335146040Stjr  /* Convert the SUBEXP node to the concatenation of an
1336146040Stjr     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1337146040Stjr  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1338146040Stjr  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1339146040Stjr  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1340146040Stjr  tree = create_tree (dfa, op, tree1, CONCAT);
1341146040Stjr  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1342146040Stjr    {
1343146040Stjr      *err = REG_ESPACE;
1344146040Stjr      return NULL;
1345146040Stjr    }
1346146040Stjr
1347146040Stjr  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1348146040Stjr  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1349146040Stjr  return tree;
1350146040Stjr}
1351146040Stjr
1352146040Stjr/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1353146040Stjr   nodes.  Requires a postorder visit.  */
1354146040Stjrstatic reg_errcode_t
1355146040Stjrcalc_first (extra, node)
1356146040Stjr     void *extra;
1357146040Stjr     bin_tree_t *node;
1358146040Stjr{
1359146040Stjr  re_dfa_t *dfa = (re_dfa_t *) extra;
1360146040Stjr  if (node->token.type == CONCAT)
1361146040Stjr    {
1362146040Stjr      node->first = node->left->first;
1363146040Stjr      node->node_idx = node->left->node_idx;
1364146040Stjr    }
1365146040Stjr  else
1366146040Stjr    {
1367146040Stjr      node->first = node;
1368146040Stjr      node->node_idx = re_dfa_add_node (dfa, node->token);
1369146040Stjr      if (BE (node->node_idx == -1, 0))
1370146040Stjr        return REG_ESPACE;
1371146040Stjr    }
1372146040Stjr  return REG_NOERROR;
1373146040Stjr}
1374146040Stjr
1375146040Stjr/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1376146040Stjrstatic reg_errcode_t
1377146040Stjrcalc_next (extra, node)
1378146040Stjr     void *extra;
1379146040Stjr     bin_tree_t *node;
1380146040Stjr{
1381146040Stjr  switch (node->token.type)
1382146040Stjr    {
1383146040Stjr    case OP_DUP_ASTERISK:
1384146040Stjr      node->left->next = node;
1385146040Stjr      break;
1386146040Stjr    case CONCAT:
1387146040Stjr      node->left->next = node->right->first;
1388146040Stjr      node->right->next = node->next;
1389146040Stjr      break;
1390146040Stjr    default:
1391146040Stjr      if (node->left)
1392146040Stjr	node->left->next = node->next;
1393146040Stjr      if (node->right)
1394146040Stjr        node->right->next = node->next;
1395146040Stjr      break;
1396146040Stjr    }
1397146040Stjr  return REG_NOERROR;
1398146040Stjr}
1399146040Stjr
1400146040Stjr/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1401146040Stjrstatic reg_errcode_t
1402146040Stjrlink_nfa_nodes (extra, node)
1403146040Stjr     void *extra;
1404146040Stjr     bin_tree_t *node;
1405146040Stjr{
1406146040Stjr  re_dfa_t *dfa = (re_dfa_t *) extra;
1407146040Stjr  int idx = node->node_idx;
1408146040Stjr  reg_errcode_t err = REG_NOERROR;
1409146040Stjr
1410146040Stjr  switch (node->token.type)
1411146040Stjr    {
1412146040Stjr    case CONCAT:
1413146040Stjr      break;
1414146040Stjr
1415146040Stjr    case END_OF_RE:
1416146040Stjr      assert (node->next == NULL);
1417146040Stjr      break;
1418146040Stjr
1419146040Stjr    case OP_DUP_ASTERISK:
1420146040Stjr    case OP_ALT:
1421146040Stjr      {
1422146040Stjr	int left, right;
1423146040Stjr	dfa->has_plural_match = 1;
1424146040Stjr	if (node->left != NULL)
1425146040Stjr	  left = node->left->first->node_idx;
1426146040Stjr	else
1427146040Stjr	  left = node->next->node_idx;
1428146040Stjr	if (node->right != NULL)
1429146040Stjr	  right = node->right->first->node_idx;
1430146040Stjr	else
1431146040Stjr	  right = node->next->node_idx;
1432146040Stjr	assert (left > -1);
1433146040Stjr	assert (right > -1);
1434146040Stjr	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1435146040Stjr      }
1436146040Stjr      break;
1437146040Stjr
1438146040Stjr    case ANCHOR:
1439146040Stjr    case OP_OPEN_SUBEXP:
1440146040Stjr    case OP_CLOSE_SUBEXP:
1441146040Stjr      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1442146040Stjr      break;
1443146040Stjr
1444146040Stjr    case OP_BACK_REF:
1445146040Stjr      dfa->nexts[idx] = node->next->node_idx;
1446146040Stjr      if (node->token.type == OP_BACK_REF)
1447146040Stjr	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1448146040Stjr      break;
1449146040Stjr
1450146040Stjr    default:
1451146040Stjr      assert (!IS_EPSILON_NODE (node->token.type));
1452146040Stjr      dfa->nexts[idx] = node->next->node_idx;
1453146040Stjr      break;
1454146040Stjr    }
1455146040Stjr
1456146040Stjr  return err;
1457146040Stjr}
1458146040Stjr
1459146040Stjr/* Duplicate the epsilon closure of the node ROOT_NODE.
1460146040Stjr   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1461146040Stjr   to their own constraint.  */
1462146040Stjr
1463146040Stjrstatic reg_errcode_t
1464146040Stjrduplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1465146040Stjr			init_constraint)
1466146040Stjr     re_dfa_t *dfa;
1467146040Stjr     int top_org_node, top_clone_node, root_node;
1468146040Stjr     unsigned int init_constraint;
1469146040Stjr{
1470146040Stjr  reg_errcode_t err;
1471146040Stjr  int org_node, clone_node, ret;
1472146040Stjr  unsigned int constraint = init_constraint;
1473146040Stjr  for (org_node = top_org_node, clone_node = top_clone_node;;)
1474146040Stjr    {
1475146040Stjr      int org_dest, clone_dest;
1476146040Stjr      if (dfa->nodes[org_node].type == OP_BACK_REF)
1477146040Stjr	{
1478146040Stjr	  /* If the back reference epsilon-transit, its destination must
1479146040Stjr	     also have the constraint.  Then duplicate the epsilon closure
1480146040Stjr	     of the destination of the back reference, and store it in
1481146040Stjr	     edests of the back reference.  */
1482146040Stjr	  org_dest = dfa->nexts[org_node];
1483146040Stjr	  re_node_set_empty (dfa->edests + clone_node);
1484146040Stjr	  err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1485146040Stjr	  if (BE (err != REG_NOERROR, 0))
1486146040Stjr	    return err;
1487146040Stjr	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1488146040Stjr	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489146040Stjr	  if (BE (ret < 0, 0))
1490146040Stjr	    return REG_ESPACE;
1491146040Stjr	}
1492146040Stjr      else if (dfa->edests[org_node].nelem == 0)
1493146040Stjr	{
1494146040Stjr	  /* In case of the node can't epsilon-transit, don't duplicate the
1495146040Stjr	     destination and store the original destination as the
1496146040Stjr	     destination of the node.  */
1497146040Stjr	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1498146040Stjr	  break;
1499146040Stjr	}
1500146040Stjr      else if (dfa->edests[org_node].nelem == 1)
1501146040Stjr	{
1502146040Stjr	  /* In case of the node can epsilon-transit, and it has only one
1503146040Stjr	     destination.  */
1504146040Stjr	  org_dest = dfa->edests[org_node].elems[0];
1505146040Stjr	  re_node_set_empty (dfa->edests + clone_node);
1506146040Stjr	  if (dfa->nodes[org_node].type == ANCHOR)
1507146040Stjr	    {
1508146040Stjr	      /* In case of the node has another constraint, append it.  */
1509146040Stjr	      if (org_node == root_node && clone_node != org_node)
1510146040Stjr		{
1511146040Stjr		  /* ...but if the node is root_node itself, it means the
1512146040Stjr		     epsilon closure have a loop, then tie it to the
1513146040Stjr		     destination of the root_node.  */
1514146040Stjr		  ret = re_node_set_insert (dfa->edests + clone_node,
1515146040Stjr					    org_dest);
1516146040Stjr		  if (BE (ret < 0, 0))
1517146040Stjr		    return REG_ESPACE;
1518146040Stjr		  break;
1519146040Stjr		}
1520146040Stjr	      constraint |= dfa->nodes[org_node].opr.ctx_type;
1521146040Stjr	    }
1522146040Stjr	  err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1523146040Stjr	  if (BE (err != REG_NOERROR, 0))
1524146040Stjr	    return err;
1525146040Stjr	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526146040Stjr	  if (BE (ret < 0, 0))
1527146040Stjr	    return REG_ESPACE;
1528146040Stjr	}
1529146040Stjr      else /* dfa->edests[org_node].nelem == 2 */
1530146040Stjr	{
1531146040Stjr	  /* In case of the node can epsilon-transit, and it has two
1532146040Stjr	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1533146040Stjr	  org_dest = dfa->edests[org_node].elems[0];
1534146040Stjr	  re_node_set_empty (dfa->edests + clone_node);
1535146040Stjr	  /* Search for a duplicated node which satisfies the constraint.  */
1536146040Stjr	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1537146040Stjr	  if (clone_dest == -1)
1538146040Stjr	    {
1539146040Stjr	      /* There are no such a duplicated node, create a new one.  */
1540146040Stjr	      err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1541146040Stjr	      if (BE (err != REG_NOERROR, 0))
1542146040Stjr		return err;
1543146040Stjr	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544146040Stjr	      if (BE (ret < 0, 0))
1545146040Stjr		return REG_ESPACE;
1546146040Stjr	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547146040Stjr					    root_node, constraint);
1548146040Stjr	      if (BE (err != REG_NOERROR, 0))
1549146040Stjr		return err;
1550146040Stjr	    }
1551146040Stjr	  else
1552146040Stjr	    {
1553146040Stjr	      /* There are a duplicated node which satisfy the constraint,
1554146040Stjr		 use it to avoid infinite loop.  */
1555146040Stjr	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556146040Stjr	      if (BE (ret < 0, 0))
1557146040Stjr		return REG_ESPACE;
1558146040Stjr	    }
1559146040Stjr
1560146040Stjr	  org_dest = dfa->edests[org_node].elems[1];
1561146040Stjr	  err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1562146040Stjr	  if (BE (err != REG_NOERROR, 0))
1563146040Stjr	    return err;
1564146040Stjr	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565146040Stjr	  if (BE (ret < 0, 0))
1566146040Stjr	    return REG_ESPACE;
1567146040Stjr	}
1568146040Stjr      org_node = org_dest;
1569146040Stjr      clone_node = clone_dest;
1570146040Stjr    }
1571146040Stjr  return REG_NOERROR;
1572146040Stjr}
1573146040Stjr
1574146040Stjr/* Search for a node which is duplicated from the node ORG_NODE, and
1575146040Stjr   satisfies the constraint CONSTRAINT.  */
1576146040Stjr
1577146040Stjrstatic int
1578146040Stjrsearch_duplicated_node (dfa, org_node, constraint)
1579146040Stjr     re_dfa_t *dfa;
1580146040Stjr     int org_node;
1581146040Stjr     unsigned int constraint;
1582146040Stjr{
1583146040Stjr  int idx;
1584146040Stjr  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1585146040Stjr    {
1586146040Stjr      if (org_node == dfa->org_indices[idx]
1587146040Stjr	  && constraint == dfa->nodes[idx].constraint)
1588146040Stjr	return idx; /* Found.  */
1589146040Stjr    }
1590146040Stjr  return -1; /* Not found.  */
1591146040Stjr}
1592146040Stjr
1593146040Stjr/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1594146040Stjr   The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1595146040Stjr   otherwise return the error code.  */
1596146040Stjr
1597146040Stjrstatic reg_errcode_t
1598146040Stjrduplicate_node (new_idx, dfa, org_idx, constraint)
1599146040Stjr     re_dfa_t *dfa;
1600146040Stjr     int *new_idx, org_idx;
1601146040Stjr     unsigned int constraint;
1602146040Stjr{
1603146040Stjr  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1604146040Stjr  if (BE (dup_idx == -1, 0))
1605146040Stjr    return REG_ESPACE;
1606146040Stjr  dfa->nodes[dup_idx].constraint = constraint;
1607146040Stjr  if (dfa->nodes[org_idx].type == ANCHOR)
1608146040Stjr    dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1609146040Stjr  dfa->nodes[dup_idx].duplicated = 1;
1610146040Stjr
1611146040Stjr  /* Store the index of the original node.  */
1612146040Stjr  dfa->org_indices[dup_idx] = org_idx;
1613146040Stjr  *new_idx = dup_idx;
1614146040Stjr  return REG_NOERROR;
1615146040Stjr}
1616146040Stjr
1617146040Stjrstatic reg_errcode_t
1618146040Stjrcalc_inveclosure (dfa)
1619146040Stjr     re_dfa_t *dfa;
1620146040Stjr{
1621146040Stjr  int src, idx, ret;
1622146040Stjr  for (idx = 0; idx < dfa->nodes_len; ++idx)
1623146040Stjr    re_node_set_init_empty (dfa->inveclosures + idx);
1624146040Stjr
1625146040Stjr  for (src = 0; src < dfa->nodes_len; ++src)
1626146040Stjr    {
1627146040Stjr      int *elems = dfa->eclosures[src].elems;
1628146040Stjr      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1629146040Stjr	{
1630146040Stjr	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1631146040Stjr	  if (BE (ret == -1, 0))
1632146040Stjr	    return REG_ESPACE;
1633146040Stjr	}
1634146040Stjr    }
1635146040Stjr
1636146040Stjr  return REG_NOERROR;
1637146040Stjr}
1638146040Stjr
1639146040Stjr/* Calculate "eclosure" for all the node in DFA.  */
1640146040Stjr
1641146040Stjrstatic reg_errcode_t
1642146040Stjrcalc_eclosure (dfa)
1643146040Stjr     re_dfa_t *dfa;
1644146040Stjr{
1645146040Stjr  int node_idx, incomplete;
1646146040Stjr#ifdef DEBUG
1647146040Stjr  assert (dfa->nodes_len > 0);
1648146040Stjr#endif
1649146040Stjr  incomplete = 0;
1650146040Stjr  /* For each nodes, calculate epsilon closure.  */
1651146040Stjr  for (node_idx = 0; ; ++node_idx)
1652146040Stjr    {
1653146040Stjr      reg_errcode_t err;
1654146040Stjr      re_node_set eclosure_elem;
1655146040Stjr      if (node_idx == dfa->nodes_len)
1656146040Stjr	{
1657146040Stjr	  if (!incomplete)
1658146040Stjr	    break;
1659146040Stjr	  incomplete = 0;
1660146040Stjr	  node_idx = 0;
1661146040Stjr	}
1662146040Stjr
1663146040Stjr#ifdef DEBUG
1664146040Stjr      assert (dfa->eclosures[node_idx].nelem != -1);
1665146040Stjr#endif
1666146040Stjr
1667146040Stjr      /* If we have already calculated, skip it.  */
1668146040Stjr      if (dfa->eclosures[node_idx].nelem != 0)
1669146040Stjr	continue;
1670146040Stjr      /* Calculate epsilon closure of `node_idx'.  */
1671146040Stjr      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1672146040Stjr      if (BE (err != REG_NOERROR, 0))
1673146040Stjr	return err;
1674146040Stjr
1675146040Stjr      if (dfa->eclosures[node_idx].nelem == 0)
1676146040Stjr	{
1677146040Stjr	  incomplete = 1;
1678146040Stjr	  re_node_set_free (&eclosure_elem);
1679146040Stjr	}
1680146040Stjr    }
1681146040Stjr  return REG_NOERROR;
1682146040Stjr}
1683146040Stjr
1684146040Stjr/* Calculate epsilon closure of NODE.  */
1685146040Stjr
1686146040Stjrstatic reg_errcode_t
1687146040Stjrcalc_eclosure_iter (new_set, dfa, node, root)
1688146040Stjr     re_node_set *new_set;
1689146040Stjr     re_dfa_t *dfa;
1690146040Stjr     int node, root;
1691146040Stjr{
1692146040Stjr  reg_errcode_t err;
1693146040Stjr  unsigned int constraint;
1694146040Stjr  int i, incomplete;
1695146040Stjr  re_node_set eclosure;
1696146040Stjr  incomplete = 0;
1697146040Stjr  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1698146040Stjr  if (BE (err != REG_NOERROR, 0))
1699146040Stjr    return err;
1700146040Stjr
1701146040Stjr  /* This indicates that we are calculating this node now.
1702146040Stjr     We reference this value to avoid infinite loop.  */
1703146040Stjr  dfa->eclosures[node].nelem = -1;
1704146040Stjr
1705146040Stjr  constraint = ((dfa->nodes[node].type == ANCHOR)
1706146040Stjr		? dfa->nodes[node].opr.ctx_type : 0);
1707146040Stjr  /* If the current node has constraints, duplicate all nodes.
1708146040Stjr     Since they must inherit the constraints.  */
1709146040Stjr  if (constraint
1710146040Stjr      && dfa->edests[node].nelem
1711146040Stjr      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1712146040Stjr    {
1713146040Stjr      int org_node, cur_node;
1714146040Stjr      org_node = cur_node = node;
1715146040Stjr      err = duplicate_node_closure (dfa, node, node, node, constraint);
1716146040Stjr      if (BE (err != REG_NOERROR, 0))
1717146040Stjr	return err;
1718146040Stjr    }
1719146040Stjr
1720146040Stjr  /* Expand each epsilon destination nodes.  */
1721146040Stjr  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1722146040Stjr    for (i = 0; i < dfa->edests[node].nelem; ++i)
1723146040Stjr      {
1724146040Stjr	re_node_set eclosure_elem;
1725146040Stjr	int edest = dfa->edests[node].elems[i];
1726146040Stjr	/* If calculating the epsilon closure of `edest' is in progress,
1727146040Stjr	   return intermediate result.  */
1728146040Stjr	if (dfa->eclosures[edest].nelem == -1)
1729146040Stjr	  {
1730146040Stjr	    incomplete = 1;
1731146040Stjr	    continue;
1732146040Stjr	  }
1733146040Stjr	/* If we haven't calculated the epsilon closure of `edest' yet,
1734146040Stjr	   calculate now. Otherwise use calculated epsilon closure.  */
1735146040Stjr	if (dfa->eclosures[edest].nelem == 0)
1736146040Stjr	  {
1737146040Stjr	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1738146040Stjr	    if (BE (err != REG_NOERROR, 0))
1739146040Stjr	      return err;
1740146040Stjr	  }
1741146040Stjr	else
1742146040Stjr	  eclosure_elem = dfa->eclosures[edest];
1743146040Stjr	/* Merge the epsilon closure of `edest'.  */
1744146040Stjr	re_node_set_merge (&eclosure, &eclosure_elem);
1745146040Stjr	/* If the epsilon closure of `edest' is incomplete,
1746146040Stjr	   the epsilon closure of this node is also incomplete.  */
1747146040Stjr	if (dfa->eclosures[edest].nelem == 0)
1748146040Stjr	  {
1749146040Stjr	    incomplete = 1;
1750146040Stjr	    re_node_set_free (&eclosure_elem);
1751146040Stjr	  }
1752146040Stjr      }
1753146040Stjr
1754146040Stjr  /* Epsilon closures include itself.  */
1755146040Stjr  re_node_set_insert (&eclosure, node);
1756146040Stjr  if (incomplete && !root)
1757146040Stjr    dfa->eclosures[node].nelem = 0;
1758146040Stjr  else
1759146040Stjr    dfa->eclosures[node] = eclosure;
1760146040Stjr  *new_set = eclosure;
1761146040Stjr  return REG_NOERROR;
1762146040Stjr}
1763146040Stjr
1764146040Stjr/* Functions for token which are used in the parser.  */
1765146040Stjr
1766146040Stjr/* Fetch a token from INPUT.
1767146040Stjr   We must not use this function inside bracket expressions.  */
1768146040Stjr
1769146040Stjrstatic void
1770146040Stjrfetch_token (result, input, syntax)
1771146040Stjr     re_token_t *result;
1772146040Stjr     re_string_t *input;
1773146040Stjr     reg_syntax_t syntax;
1774146040Stjr{
1775146040Stjr  re_string_skip_bytes (input, peek_token (result, input, syntax));
1776146040Stjr}
1777146040Stjr
1778146040Stjr/* Peek a token from INPUT, and return the length of the token.
1779146040Stjr   We must not use this function inside bracket expressions.  */
1780146040Stjr
1781146040Stjrstatic int
1782146040Stjrpeek_token (token, input, syntax)
1783146040Stjr     re_token_t *token;
1784146040Stjr     re_string_t *input;
1785146040Stjr     reg_syntax_t syntax;
1786146040Stjr{
1787146040Stjr  unsigned char c;
1788146040Stjr
1789146040Stjr  if (re_string_eoi (input))
1790146040Stjr    {
1791146040Stjr      token->type = END_OF_RE;
1792146040Stjr      return 0;
1793146040Stjr    }
1794146040Stjr
1795146040Stjr  c = re_string_peek_byte (input, 0);
1796146040Stjr  token->opr.c = c;
1797146040Stjr
1798146040Stjr  token->word_char = 0;
1799146040Stjr#ifdef RE_ENABLE_I18N
1800146040Stjr  token->mb_partial = 0;
1801146040Stjr  if (input->mb_cur_max > 1 &&
1802146040Stjr      !re_string_first_byte (input, re_string_cur_idx (input)))
1803146040Stjr    {
1804146040Stjr      token->type = CHARACTER;
1805146040Stjr      token->mb_partial = 1;
1806146040Stjr      return 1;
1807146040Stjr    }
1808146040Stjr#endif
1809146040Stjr  if (c == '\\')
1810146040Stjr    {
1811146040Stjr      unsigned char c2;
1812146040Stjr      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1813146040Stjr	{
1814146040Stjr	  token->type = BACK_SLASH;
1815146040Stjr	  return 1;
1816146040Stjr	}
1817146040Stjr
1818146040Stjr      c2 = re_string_peek_byte_case (input, 1);
1819146040Stjr      token->opr.c = c2;
1820146040Stjr      token->type = CHARACTER;
1821146040Stjr#ifdef RE_ENABLE_I18N
1822146040Stjr      if (input->mb_cur_max > 1)
1823146040Stjr	{
1824146040Stjr	  wint_t wc = re_string_wchar_at (input,
1825146040Stjr					  re_string_cur_idx (input) + 1);
1826146040Stjr	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1827146040Stjr	}
1828146040Stjr      else
1829146040Stjr#endif
1830146040Stjr	token->word_char = IS_WORD_CHAR (c2) != 0;
1831146040Stjr
1832146040Stjr      switch (c2)
1833146040Stjr	{
1834146040Stjr	case '|':
1835146040Stjr	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1836146040Stjr	    token->type = OP_ALT;
1837146040Stjr	  break;
1838146040Stjr	case '1': case '2': case '3': case '4': case '5':
1839146040Stjr	case '6': case '7': case '8': case '9':
1840146040Stjr	  if (!(syntax & RE_NO_BK_REFS))
1841146040Stjr	    {
1842146040Stjr	      token->type = OP_BACK_REF;
1843146040Stjr	      token->opr.idx = c2 - '1';
1844146040Stjr	    }
1845146040Stjr	  break;
1846146040Stjr	case '<':
1847146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1848146040Stjr	    {
1849146040Stjr	      token->type = ANCHOR;
1850146040Stjr	      token->opr.ctx_type = WORD_FIRST;
1851146040Stjr	    }
1852146040Stjr	  break;
1853146040Stjr	case '>':
1854146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1855146040Stjr	    {
1856146040Stjr	      token->type = ANCHOR;
1857146040Stjr	      token->opr.ctx_type = WORD_LAST;
1858146040Stjr	    }
1859146040Stjr	  break;
1860146040Stjr	case 'b':
1861146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1862146040Stjr	    {
1863146040Stjr	      token->type = ANCHOR;
1864146040Stjr	      token->opr.ctx_type = WORD_DELIM;
1865146040Stjr	    }
1866146040Stjr	  break;
1867146040Stjr	case 'B':
1868146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1869146040Stjr	    {
1870146040Stjr	      token->type = ANCHOR;
1871146040Stjr	      token->opr.ctx_type = NOT_WORD_DELIM;
1872146040Stjr	    }
1873146040Stjr	  break;
1874146040Stjr	case 'w':
1875146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1876146040Stjr	    token->type = OP_WORD;
1877146040Stjr	  break;
1878146040Stjr	case 'W':
1879146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1880146040Stjr	    token->type = OP_NOTWORD;
1881146040Stjr	  break;
1882146040Stjr	case 's':
1883146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1884146040Stjr	    token->type = OP_SPACE;
1885146040Stjr	  break;
1886146040Stjr	case 'S':
1887146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1888146040Stjr	    token->type = OP_NOTSPACE;
1889146040Stjr	  break;
1890146040Stjr	case '`':
1891146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1892146040Stjr	    {
1893146040Stjr	      token->type = ANCHOR;
1894146040Stjr	      token->opr.ctx_type = BUF_FIRST;
1895146040Stjr	    }
1896146040Stjr	  break;
1897146040Stjr	case '\'':
1898146040Stjr	  if (!(syntax & RE_NO_GNU_OPS))
1899146040Stjr	    {
1900146040Stjr	      token->type = ANCHOR;
1901146040Stjr	      token->opr.ctx_type = BUF_LAST;
1902146040Stjr	    }
1903146040Stjr	  break;
1904146040Stjr	case '(':
1905146040Stjr	  if (!(syntax & RE_NO_BK_PARENS))
1906146040Stjr	    token->type = OP_OPEN_SUBEXP;
1907146040Stjr	  break;
1908146040Stjr	case ')':
1909146040Stjr	  if (!(syntax & RE_NO_BK_PARENS))
1910146040Stjr	    token->type = OP_CLOSE_SUBEXP;
1911146040Stjr	  break;
1912146040Stjr	case '+':
1913146040Stjr	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914146040Stjr	    token->type = OP_DUP_PLUS;
1915146040Stjr	  break;
1916146040Stjr	case '?':
1917146040Stjr	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1918146040Stjr	    token->type = OP_DUP_QUESTION;
1919146040Stjr	  break;
1920146040Stjr	case '{':
1921146040Stjr	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922146040Stjr	    token->type = OP_OPEN_DUP_NUM;
1923146040Stjr	  break;
1924146040Stjr	case '}':
1925146040Stjr	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1926146040Stjr	    token->type = OP_CLOSE_DUP_NUM;
1927146040Stjr	  break;
1928146040Stjr	default:
1929146040Stjr	  break;
1930146040Stjr	}
1931146040Stjr      return 2;
1932146040Stjr    }
1933146040Stjr
1934146040Stjr  token->type = CHARACTER;
1935146040Stjr#ifdef RE_ENABLE_I18N
1936146040Stjr  if (input->mb_cur_max > 1)
1937146040Stjr    {
1938146040Stjr      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1939146040Stjr      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1940146040Stjr    }
1941146040Stjr  else
1942146040Stjr#endif
1943146040Stjr    token->word_char = IS_WORD_CHAR (token->opr.c);
1944146040Stjr
1945146040Stjr  switch (c)
1946146040Stjr    {
1947146040Stjr    case '\n':
1948146040Stjr      if (syntax & RE_NEWLINE_ALT)
1949146040Stjr	token->type = OP_ALT;
1950146040Stjr      break;
1951146040Stjr    case '|':
1952146040Stjr      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1953146040Stjr	token->type = OP_ALT;
1954146040Stjr      break;
1955146040Stjr    case '*':
1956146040Stjr      token->type = OP_DUP_ASTERISK;
1957146040Stjr      break;
1958146040Stjr    case '+':
1959146040Stjr      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960146040Stjr	token->type = OP_DUP_PLUS;
1961146040Stjr      break;
1962146040Stjr    case '?':
1963146040Stjr      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1964146040Stjr	token->type = OP_DUP_QUESTION;
1965146040Stjr      break;
1966146040Stjr    case '{':
1967146040Stjr      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968146040Stjr	token->type = OP_OPEN_DUP_NUM;
1969146040Stjr      break;
1970146040Stjr    case '}':
1971146040Stjr      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1972146040Stjr	token->type = OP_CLOSE_DUP_NUM;
1973146040Stjr      break;
1974146040Stjr    case '(':
1975146040Stjr      if (syntax & RE_NO_BK_PARENS)
1976146040Stjr	token->type = OP_OPEN_SUBEXP;
1977146040Stjr      break;
1978146040Stjr    case ')':
1979146040Stjr      if (syntax & RE_NO_BK_PARENS)
1980146040Stjr	token->type = OP_CLOSE_SUBEXP;
1981146040Stjr      break;
1982146040Stjr    case '[':
1983146040Stjr      token->type = OP_OPEN_BRACKET;
1984146040Stjr      break;
1985146040Stjr    case '.':
1986146040Stjr      token->type = OP_PERIOD;
1987146040Stjr      break;
1988146040Stjr    case '^':
1989146040Stjr      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1990146040Stjr	  re_string_cur_idx (input) != 0)
1991146040Stjr	{
1992146040Stjr	  char prev = re_string_peek_byte (input, -1);
1993146040Stjr	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1994146040Stjr	    break;
1995146040Stjr	}
1996146040Stjr      token->type = ANCHOR;
1997146040Stjr      token->opr.ctx_type = LINE_FIRST;
1998146040Stjr      break;
1999146040Stjr    case '$':
2000146040Stjr      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2001146040Stjr	  re_string_cur_idx (input) + 1 != re_string_length (input))
2002146040Stjr	{
2003146040Stjr	  re_token_t next;
2004146040Stjr	  re_string_skip_bytes (input, 1);
2005146040Stjr	  peek_token (&next, input, syntax);
2006146040Stjr	  re_string_skip_bytes (input, -1);
2007146040Stjr	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2008146040Stjr	    break;
2009146040Stjr	}
2010146040Stjr      token->type = ANCHOR;
2011146040Stjr      token->opr.ctx_type = LINE_LAST;
2012146040Stjr      break;
2013146040Stjr    default:
2014146040Stjr      break;
2015146040Stjr    }
2016146040Stjr  return 1;
2017146040Stjr}
2018146040Stjr
2019146040Stjr/* Peek a token from INPUT, and return the length of the token.
2020146040Stjr   We must not use this function out of bracket expressions.  */
2021146040Stjr
2022146040Stjrstatic int
2023146040Stjrpeek_token_bracket (token, input, syntax)
2024146040Stjr     re_token_t *token;
2025146040Stjr     re_string_t *input;
2026146040Stjr     reg_syntax_t syntax;
2027146040Stjr{
2028146040Stjr  unsigned char c;
2029146040Stjr  if (re_string_eoi (input))
2030146040Stjr    {
2031146040Stjr      token->type = END_OF_RE;
2032146040Stjr      return 0;
2033146040Stjr    }
2034146040Stjr  c = re_string_peek_byte (input, 0);
2035146040Stjr  token->opr.c = c;
2036146040Stjr
2037146040Stjr#ifdef RE_ENABLE_I18N
2038146040Stjr  if (input->mb_cur_max > 1 &&
2039146040Stjr      !re_string_first_byte (input, re_string_cur_idx (input)))
2040146040Stjr    {
2041146040Stjr      token->type = CHARACTER;
2042146040Stjr      return 1;
2043146040Stjr    }
2044146040Stjr#endif /* RE_ENABLE_I18N */
2045146040Stjr
2046146040Stjr  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2047146040Stjr      && re_string_cur_idx (input) + 1 < re_string_length (input))
2048146040Stjr    {
2049146040Stjr      /* In this case, '\' escape a character.  */
2050146040Stjr      unsigned char c2;
2051146040Stjr      re_string_skip_bytes (input, 1);
2052146040Stjr      c2 = re_string_peek_byte (input, 0);
2053146040Stjr      token->opr.c = c2;
2054146040Stjr      token->type = CHARACTER;
2055146040Stjr      return 1;
2056146040Stjr    }
2057146040Stjr  if (c == '[') /* '[' is a special char in a bracket exps.  */
2058146040Stjr    {
2059146040Stjr      unsigned char c2;
2060146040Stjr      int token_len;
2061146040Stjr      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2062146040Stjr	c2 = re_string_peek_byte (input, 1);
2063146040Stjr      else
2064146040Stjr	c2 = 0;
2065146040Stjr      token->opr.c = c2;
2066146040Stjr      token_len = 2;
2067146040Stjr      switch (c2)
2068146040Stjr	{
2069146040Stjr	case '.':
2070146040Stjr	  token->type = OP_OPEN_COLL_ELEM;
2071146040Stjr	  break;
2072146040Stjr	case '=':
2073146040Stjr	  token->type = OP_OPEN_EQUIV_CLASS;
2074146040Stjr	  break;
2075146040Stjr	case ':':
2076146040Stjr	  if (syntax & RE_CHAR_CLASSES)
2077146040Stjr	    {
2078146040Stjr	      token->type = OP_OPEN_CHAR_CLASS;
2079146040Stjr	      break;
2080146040Stjr	    }
2081146040Stjr	  /* else fall through.  */
2082146040Stjr	default:
2083146040Stjr	  token->type = CHARACTER;
2084146040Stjr	  token->opr.c = c;
2085146040Stjr	  token_len = 1;
2086146040Stjr	  break;
2087146040Stjr	}
2088146040Stjr      return token_len;
2089146040Stjr    }
2090146040Stjr  switch (c)
2091146040Stjr    {
2092146040Stjr    case '-':
2093146040Stjr      token->type = OP_CHARSET_RANGE;
2094146040Stjr      break;
2095146040Stjr    case ']':
2096146040Stjr      token->type = OP_CLOSE_BRACKET;
2097146040Stjr      break;
2098146040Stjr    case '^':
2099146040Stjr      token->type = OP_NON_MATCH_LIST;
2100146040Stjr      break;
2101146040Stjr    default:
2102146040Stjr      token->type = CHARACTER;
2103146040Stjr    }
2104146040Stjr  return 1;
2105146040Stjr}
2106146040Stjr
2107146040Stjr/* Functions for parser.  */
2108146040Stjr
2109146040Stjr/* Entry point of the parser.
2110146040Stjr   Parse the regular expression REGEXP and return the structure tree.
2111146040Stjr   If an error is occured, ERR is set by error code, and return NULL.
2112146040Stjr   This function build the following tree, from regular expression <reg_exp>:
2113146040Stjr	   CAT
2114146040Stjr	   / \
2115146040Stjr	  /   \
2116146040Stjr   <reg_exp>  EOR
2117146040Stjr
2118146040Stjr   CAT means concatenation.
2119146040Stjr   EOR means end of regular expression.  */
2120146040Stjr
2121146040Stjrstatic bin_tree_t *
2122146040Stjrparse (regexp, preg, syntax, err)
2123146040Stjr     re_string_t *regexp;
2124146040Stjr     regex_t *preg;
2125146040Stjr     reg_syntax_t syntax;
2126146040Stjr     reg_errcode_t *err;
2127146040Stjr{
2128146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2129146040Stjr  bin_tree_t *tree, *eor, *root;
2130146040Stjr  re_token_t current_token;
2131146040Stjr  dfa->syntax = syntax;
2132146040Stjr  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2133146040Stjr  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2134146040Stjr  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2135146040Stjr    return NULL;
2136146040Stjr  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2137146040Stjr  if (tree != NULL)
2138146040Stjr    root = create_tree (dfa, tree, eor, CONCAT);
2139146040Stjr  else
2140146040Stjr    root = eor;
2141146040Stjr  if (BE (eor == NULL || root == NULL, 0))
2142146040Stjr    {
2143146040Stjr      *err = REG_ESPACE;
2144146040Stjr      return NULL;
2145146040Stjr    }
2146146040Stjr  return root;
2147146040Stjr}
2148146040Stjr
2149146040Stjr/* This function build the following tree, from regular expression
2150146040Stjr   <branch1>|<branch2>:
2151146040Stjr	   ALT
2152146040Stjr	   / \
2153146040Stjr	  /   \
2154146040Stjr   <branch1> <branch2>
2155146040Stjr
2156146040Stjr   ALT means alternative, which represents the operator `|'.  */
2157146040Stjr
2158146040Stjrstatic bin_tree_t *
2159146040Stjrparse_reg_exp (regexp, preg, token, syntax, nest, err)
2160146040Stjr     re_string_t *regexp;
2161146040Stjr     regex_t *preg;
2162146040Stjr     re_token_t *token;
2163146040Stjr     reg_syntax_t syntax;
2164146040Stjr     int nest;
2165146040Stjr     reg_errcode_t *err;
2166146040Stjr{
2167146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2168146040Stjr  bin_tree_t *tree, *branch = NULL;
2169146040Stjr  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2170146040Stjr  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2171146040Stjr    return NULL;
2172146040Stjr
2173146040Stjr  while (token->type == OP_ALT)
2174146040Stjr    {
2175146040Stjr      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2176146040Stjr      if (token->type != OP_ALT && token->type != END_OF_RE
2177146040Stjr	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2178146040Stjr	{
2179146040Stjr	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2180146040Stjr	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2181146040Stjr	    return NULL;
2182146040Stjr	}
2183146040Stjr      else
2184146040Stjr	branch = NULL;
2185146040Stjr      tree = create_tree (dfa, tree, branch, OP_ALT);
2186146040Stjr      if (BE (tree == NULL, 0))
2187146040Stjr	{
2188146040Stjr	  *err = REG_ESPACE;
2189146040Stjr	  return NULL;
2190146040Stjr	}
2191146040Stjr    }
2192146040Stjr  return tree;
2193146040Stjr}
2194146040Stjr
2195146040Stjr/* This function build the following tree, from regular expression
2196146040Stjr   <exp1><exp2>:
2197146040Stjr	CAT
2198146040Stjr	/ \
2199146040Stjr       /   \
2200146040Stjr   <exp1> <exp2>
2201146040Stjr
2202146040Stjr   CAT means concatenation.  */
2203146040Stjr
2204146040Stjrstatic bin_tree_t *
2205146040Stjrparse_branch (regexp, preg, token, syntax, nest, err)
2206146040Stjr     re_string_t *regexp;
2207146040Stjr     regex_t *preg;
2208146040Stjr     re_token_t *token;
2209146040Stjr     reg_syntax_t syntax;
2210146040Stjr     int nest;
2211146040Stjr     reg_errcode_t *err;
2212146040Stjr{
2213146040Stjr  bin_tree_t *tree, *exp;
2214146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2215146040Stjr  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2216146040Stjr  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2217146040Stjr    return NULL;
2218146040Stjr
2219146040Stjr  while (token->type != OP_ALT && token->type != END_OF_RE
2220146040Stjr	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2221146040Stjr    {
2222146040Stjr      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2223146040Stjr      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2224146040Stjr	{
2225146040Stjr	  return NULL;
2226146040Stjr	}
2227146040Stjr      if (tree != NULL && exp != NULL)
2228146040Stjr	{
2229146040Stjr	  tree = create_tree (dfa, tree, exp, CONCAT);
2230146040Stjr	  if (tree == NULL)
2231146040Stjr	    {
2232146040Stjr	      *err = REG_ESPACE;
2233146040Stjr	      return NULL;
2234146040Stjr	    }
2235146040Stjr	}
2236146040Stjr      else if (tree == NULL)
2237146040Stjr	tree = exp;
2238146040Stjr      /* Otherwise exp == NULL, we don't need to create new tree.  */
2239146040Stjr    }
2240146040Stjr  return tree;
2241146040Stjr}
2242146040Stjr
2243146040Stjr/* This function build the following tree, from regular expression a*:
2244146040Stjr	 *
2245146040Stjr	 |
2246146040Stjr	 a
2247146040Stjr*/
2248146040Stjr
2249146040Stjrstatic bin_tree_t *
2250146040Stjrparse_expression (regexp, preg, token, syntax, nest, err)
2251146040Stjr     re_string_t *regexp;
2252146040Stjr     regex_t *preg;
2253146040Stjr     re_token_t *token;
2254146040Stjr     reg_syntax_t syntax;
2255146040Stjr     int nest;
2256146040Stjr     reg_errcode_t *err;
2257146040Stjr{
2258146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2259146040Stjr  bin_tree_t *tree;
2260146040Stjr  switch (token->type)
2261146040Stjr    {
2262146040Stjr    case CHARACTER:
2263146040Stjr      tree = create_token_tree (dfa, NULL, NULL, token);
2264146040Stjr      if (BE (tree == NULL, 0))
2265146040Stjr	{
2266146040Stjr	  *err = REG_ESPACE;
2267146040Stjr	  return NULL;
2268146040Stjr	}
2269146040Stjr#ifdef RE_ENABLE_I18N
2270146040Stjr      if (dfa->mb_cur_max > 1)
2271146040Stjr	{
2272146040Stjr	  while (!re_string_eoi (regexp)
2273146040Stjr		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2274146040Stjr	    {
2275146040Stjr	      bin_tree_t *mbc_remain;
2276146040Stjr	      fetch_token (token, regexp, syntax);
2277146040Stjr	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2278146040Stjr	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2279146040Stjr	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2280146040Stjr		{
2281146040Stjr		  *err = REG_ESPACE;
2282146040Stjr		  return NULL;
2283146040Stjr		}
2284146040Stjr	    }
2285146040Stjr	}
2286146040Stjr#endif
2287146040Stjr      break;
2288146040Stjr    case OP_OPEN_SUBEXP:
2289146040Stjr      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2290146040Stjr      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2291146040Stjr	return NULL;
2292146040Stjr      break;
2293146040Stjr    case OP_OPEN_BRACKET:
2294146040Stjr      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2295146040Stjr      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2296146040Stjr	return NULL;
2297146040Stjr      break;
2298146040Stjr    case OP_BACK_REF:
2299146040Stjr      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2300146040Stjr	{
2301146040Stjr	  *err = REG_ESUBREG;
2302146040Stjr	  return NULL;
2303146040Stjr	}
2304146040Stjr      dfa->used_bkref_map |= 1 << token->opr.idx;
2305146040Stjr      tree = create_token_tree (dfa, NULL, NULL, token);
2306146040Stjr      if (BE (tree == NULL, 0))
2307146040Stjr	{
2308146040Stjr	  *err = REG_ESPACE;
2309146040Stjr	  return NULL;
2310146040Stjr	}
2311146040Stjr      ++dfa->nbackref;
2312146040Stjr      dfa->has_mb_node = 1;
2313146040Stjr      break;
2314146040Stjr    case OP_OPEN_DUP_NUM:
2315146040Stjr      if (syntax & RE_CONTEXT_INVALID_DUP)
2316146040Stjr	{
2317146040Stjr	  *err = REG_BADRPT;
2318146040Stjr	  return NULL;
2319146040Stjr	}
2320146040Stjr      /* FALLTHROUGH */
2321146040Stjr    case OP_DUP_ASTERISK:
2322146040Stjr    case OP_DUP_PLUS:
2323146040Stjr    case OP_DUP_QUESTION:
2324146040Stjr      if (syntax & RE_CONTEXT_INVALID_OPS)
2325146040Stjr	{
2326146040Stjr	  *err = REG_BADRPT;
2327146040Stjr	  return NULL;
2328146040Stjr	}
2329146040Stjr      else if (syntax & RE_CONTEXT_INDEP_OPS)
2330146040Stjr	{
2331146040Stjr	  fetch_token (token, regexp, syntax);
2332146040Stjr	  return parse_expression (regexp, preg, token, syntax, nest, err);
2333146040Stjr	}
2334146040Stjr      /* else fall through  */
2335146040Stjr    case OP_CLOSE_SUBEXP:
2336146040Stjr      if ((token->type == OP_CLOSE_SUBEXP) &&
2337146040Stjr	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2338146040Stjr	{
2339146040Stjr	  *err = REG_ERPAREN;
2340146040Stjr	  return NULL;
2341146040Stjr	}
2342146040Stjr      /* else fall through  */
2343146040Stjr    case OP_CLOSE_DUP_NUM:
2344146040Stjr      /* We treat it as a normal character.  */
2345146040Stjr
2346146040Stjr      /* Then we can these characters as normal characters.  */
2347146040Stjr      token->type = CHARACTER;
2348146040Stjr      /* mb_partial and word_char bits should be initialized already
2349146040Stjr	 by peek_token.  */
2350146040Stjr      tree = create_token_tree (dfa, NULL, NULL, token);
2351146040Stjr      if (BE (tree == NULL, 0))
2352146040Stjr	{
2353146040Stjr	  *err = REG_ESPACE;
2354146040Stjr	  return NULL;
2355146040Stjr	}
2356146040Stjr      break;
2357146040Stjr    case ANCHOR:
2358146040Stjr      if ((token->opr.ctx_type
2359146040Stjr	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2360146040Stjr	  && dfa->word_ops_used == 0)
2361146040Stjr	init_word_char (dfa);
2362146040Stjr      if (token->opr.ctx_type == WORD_DELIM
2363146040Stjr          || token->opr.ctx_type == NOT_WORD_DELIM)
2364146040Stjr	{
2365146040Stjr	  bin_tree_t *tree_first, *tree_last;
2366146040Stjr	  if (token->opr.ctx_type == WORD_DELIM)
2367146040Stjr	    {
2368146040Stjr	      token->opr.ctx_type = WORD_FIRST;
2369146040Stjr	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2370146040Stjr	      token->opr.ctx_type = WORD_LAST;
2371146040Stjr            }
2372146040Stjr          else
2373146040Stjr            {
2374146040Stjr	      token->opr.ctx_type = INSIDE_WORD;
2375146040Stjr	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2376146040Stjr	      token->opr.ctx_type = INSIDE_NOTWORD;
2377146040Stjr            }
2378146040Stjr	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2379146040Stjr	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2380146040Stjr	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2381146040Stjr	    {
2382146040Stjr	      *err = REG_ESPACE;
2383146040Stjr	      return NULL;
2384146040Stjr	    }
2385146040Stjr	}
2386146040Stjr      else
2387146040Stjr	{
2388146040Stjr	  tree = create_token_tree (dfa, NULL, NULL, token);
2389146040Stjr	  if (BE (tree == NULL, 0))
2390146040Stjr	    {
2391146040Stjr	      *err = REG_ESPACE;
2392146040Stjr	      return NULL;
2393146040Stjr	    }
2394146040Stjr	}
2395146040Stjr      /* We must return here, since ANCHORs can't be followed
2396146040Stjr	 by repetition operators.
2397146040Stjr	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2398146040Stjr	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2399146040Stjr      fetch_token (token, regexp, syntax);
2400146040Stjr      return tree;
2401146040Stjr    case OP_PERIOD:
2402146040Stjr      tree = create_token_tree (dfa, NULL, NULL, token);
2403146040Stjr      if (BE (tree == NULL, 0))
2404146040Stjr	{
2405146040Stjr	  *err = REG_ESPACE;
2406146040Stjr	  return NULL;
2407146040Stjr	}
2408146040Stjr      if (dfa->mb_cur_max > 1)
2409146040Stjr	dfa->has_mb_node = 1;
2410146040Stjr      break;
2411146040Stjr    case OP_WORD:
2412146040Stjr    case OP_NOTWORD:
2413146040Stjr      tree = build_charclass_op (dfa, regexp->trans,
2414146040Stjr				 (const unsigned char *) "alnum",
2415146040Stjr				 (const unsigned char *) "_",
2416146040Stjr				 token->type == OP_NOTWORD, err);
2417146040Stjr      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418146040Stjr	return NULL;
2419146040Stjr      break;
2420146040Stjr    case OP_SPACE:
2421146040Stjr    case OP_NOTSPACE:
2422146040Stjr      tree = build_charclass_op (dfa, regexp->trans,
2423146040Stjr				 (const unsigned char *) "space",
2424146040Stjr				 (const unsigned char *) "",
2425146040Stjr				 token->type == OP_NOTSPACE, err);
2426146040Stjr      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2427146040Stjr	return NULL;
2428146040Stjr      break;
2429146040Stjr    case OP_ALT:
2430146040Stjr    case END_OF_RE:
2431146040Stjr      return NULL;
2432146040Stjr    case BACK_SLASH:
2433146040Stjr      *err = REG_EESCAPE;
2434146040Stjr      return NULL;
2435146040Stjr    default:
2436146040Stjr      /* Must not happen?  */
2437146040Stjr#ifdef DEBUG
2438146040Stjr      assert (0);
2439146040Stjr#endif
2440146040Stjr      return NULL;
2441146040Stjr    }
2442146040Stjr  fetch_token (token, regexp, syntax);
2443146040Stjr
2444146040Stjr  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2445146040Stjr	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2446146040Stjr    {
2447146040Stjr      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2448146040Stjr      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2449146040Stjr	return NULL;
2450146040Stjr      /* In BRE consecutive duplications are not allowed.  */
2451146040Stjr      if ((syntax & RE_CONTEXT_INVALID_DUP)
2452146040Stjr	  && (token->type == OP_DUP_ASTERISK
2453146040Stjr	      || token->type == OP_OPEN_DUP_NUM))
2454146040Stjr	{
2455146040Stjr	  *err = REG_BADRPT;
2456146040Stjr	  return NULL;
2457146040Stjr	}
2458146040Stjr    }
2459146040Stjr
2460146040Stjr  return tree;
2461146040Stjr}
2462146040Stjr
2463146040Stjr/* This function build the following tree, from regular expression
2464146040Stjr   (<reg_exp>):
2465146040Stjr	 SUBEXP
2466146040Stjr	    |
2467146040Stjr	<reg_exp>
2468146040Stjr*/
2469146040Stjr
2470146040Stjrstatic bin_tree_t *
2471146040Stjrparse_sub_exp (regexp, preg, token, syntax, nest, err)
2472146040Stjr     re_string_t *regexp;
2473146040Stjr     regex_t *preg;
2474146040Stjr     re_token_t *token;
2475146040Stjr     reg_syntax_t syntax;
2476146040Stjr     int nest;
2477146040Stjr     reg_errcode_t *err;
2478146040Stjr{
2479146040Stjr  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2480146040Stjr  bin_tree_t *tree;
2481146040Stjr  size_t cur_nsub;
2482146040Stjr  cur_nsub = preg->re_nsub++;
2483146040Stjr
2484146040Stjr  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2485146040Stjr
2486146040Stjr  /* The subexpression may be a null string.  */
2487146040Stjr  if (token->type == OP_CLOSE_SUBEXP)
2488146040Stjr    tree = NULL;
2489146040Stjr  else
2490146040Stjr    {
2491146040Stjr      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2492146040Stjr      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2493146040Stjr        *err = REG_EPAREN;
2494146040Stjr      if (BE (*err != REG_NOERROR, 0))
2495146040Stjr	return NULL;
2496146040Stjr    }
2497146040Stjr  dfa->completed_bkref_map |= 1 << cur_nsub;
2498146040Stjr
2499146040Stjr  tree = create_tree (dfa, tree, NULL, SUBEXP);
2500146040Stjr  if (BE (tree == NULL, 0))
2501146040Stjr    {
2502146040Stjr      *err = REG_ESPACE;
2503146040Stjr      return NULL;
2504146040Stjr    }
2505146040Stjr  tree->token.opr.idx = cur_nsub;
2506146040Stjr  return tree;
2507146040Stjr}
2508146040Stjr
2509146040Stjr/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2510146040Stjr
2511146040Stjrstatic bin_tree_t *
2512146040Stjrparse_dup_op (elem, regexp, dfa, token, syntax, err)
2513146040Stjr     bin_tree_t *elem;
2514146040Stjr     re_string_t *regexp;
2515146040Stjr     re_dfa_t *dfa;
2516146040Stjr     re_token_t *token;
2517146040Stjr     reg_syntax_t syntax;
2518146040Stjr     reg_errcode_t *err;
2519146040Stjr{
2520146040Stjr  bin_tree_t *tree = NULL, *old_tree = NULL;
2521146040Stjr  int i, start, end, start_idx = re_string_cur_idx (regexp);
2522146040Stjr  re_token_t start_token = *token;
2523146040Stjr
2524146040Stjr  if (token->type == OP_OPEN_DUP_NUM)
2525146040Stjr    {
2526146040Stjr      end = 0;
2527146040Stjr      start = fetch_number (regexp, token, syntax);
2528146040Stjr      if (start == -1)
2529146040Stjr	{
2530146040Stjr	  if (token->type == CHARACTER && token->opr.c == ',')
2531146040Stjr	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2532146040Stjr	  else
2533146040Stjr	    {
2534146040Stjr	      *err = REG_BADBR; /* <re>{} is invalid.  */
2535146040Stjr	      return NULL;
2536146040Stjr	    }
2537146040Stjr	}
2538146040Stjr      if (BE (start != -2, 1))
2539146040Stjr	{
2540146040Stjr	  /* We treat "{n}" as "{n,n}".  */
2541146040Stjr	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2542146040Stjr		 : ((token->type == CHARACTER && token->opr.c == ',')
2543146040Stjr		    ? fetch_number (regexp, token, syntax) : -2));
2544146040Stjr	}
2545146040Stjr      if (BE (start == -2 || end == -2, 0))
2546146040Stjr	{
2547146040Stjr	  /* Invalid sequence.  */
2548146040Stjr	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2549146040Stjr	    {
2550146040Stjr	      if (token->type == END_OF_RE)
2551146040Stjr		*err = REG_EBRACE;
2552146040Stjr	      else
2553146040Stjr		*err = REG_BADBR;
2554146040Stjr
2555146040Stjr	      return NULL;
2556146040Stjr	    }
2557146040Stjr
2558146040Stjr	  /* If the syntax bit is set, rollback.  */
2559146040Stjr	  re_string_set_index (regexp, start_idx);
2560146040Stjr	  *token = start_token;
2561146040Stjr	  token->type = CHARACTER;
2562146040Stjr	  /* mb_partial and word_char bits should be already initialized by
2563146040Stjr	     peek_token.  */
2564146040Stjr	  return elem;
2565146040Stjr	}
2566146040Stjr
2567146040Stjr      if (BE (end != -1 && start > end, 0))
2568146040Stjr	{
2569146040Stjr	  /* First number greater than second.  */
2570146040Stjr	  *err = REG_BADBR;
2571146040Stjr	  return NULL;
2572146040Stjr	}
2573146040Stjr    }
2574146040Stjr  else
2575146040Stjr    {
2576146040Stjr      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2577146040Stjr      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2578146040Stjr    }
2579146040Stjr
2580146040Stjr  fetch_token (token, regexp, syntax);
2581146040Stjr
2582146040Stjr  if (BE (elem == NULL, 0))
2583146040Stjr    return NULL;
2584146040Stjr  if (BE (start == 0 && end == 0, 0))
2585146040Stjr    {
2586146040Stjr      postorder (elem, free_tree, NULL);
2587146040Stjr      return NULL;
2588146040Stjr    }
2589146040Stjr
2590146040Stjr  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2591146040Stjr  if (BE (start > 0, 0))
2592146040Stjr    {
2593146040Stjr      tree = elem;
2594146040Stjr      for (i = 2; i <= start; ++i)
2595146040Stjr	{
2596146040Stjr	  elem = duplicate_tree (elem, dfa);
2597146040Stjr	  tree = create_tree (dfa, tree, elem, CONCAT);
2598146040Stjr	  if (BE (elem == NULL || tree == NULL, 0))
2599146040Stjr	    goto parse_dup_op_espace;
2600146040Stjr	}
2601146040Stjr
2602146040Stjr      if (start == end)
2603146040Stjr	return tree;
2604146040Stjr
2605146040Stjr      /* Duplicate ELEM before it is marked optional.  */
2606146040Stjr      elem = duplicate_tree (elem, dfa);
2607146040Stjr      old_tree = tree;
2608146040Stjr    }
2609146040Stjr  else
2610146040Stjr    old_tree = NULL;
2611146040Stjr
2612146040Stjr  if (elem->token.type == SUBEXP)
2613146040Stjr    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2614146040Stjr
2615146040Stjr  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2616146040Stjr  if (BE (tree == NULL, 0))
2617146040Stjr    goto parse_dup_op_espace;
2618146040Stjr
2619146040Stjr  /* This loop is actually executed only when end != -1,
2620146040Stjr     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2621146040Stjr     already created the start+1-th copy.  */
2622146040Stjr  for (i = start + 2; i <= end; ++i)
2623146040Stjr    {
2624146040Stjr      elem = duplicate_tree (elem, dfa);
2625146040Stjr      tree = create_tree (dfa, tree, elem, CONCAT);
2626146040Stjr      if (BE (elem == NULL || tree == NULL, 0))
2627146040Stjr        goto parse_dup_op_espace;
2628146040Stjr
2629146040Stjr      tree = create_tree (dfa, tree, NULL, OP_ALT);
2630146040Stjr      if (BE (tree == NULL, 0))
2631146040Stjr        goto parse_dup_op_espace;
2632146040Stjr    }
2633146040Stjr
2634146040Stjr  if (old_tree)
2635146040Stjr    tree = create_tree (dfa, old_tree, tree, CONCAT);
2636146040Stjr
2637146040Stjr  return tree;
2638146040Stjr
2639146040Stjr parse_dup_op_espace:
2640146040Stjr  *err = REG_ESPACE;
2641146040Stjr  return NULL;
2642146040Stjr}
2643146040Stjr
2644146040Stjr/* Size of the names for collating symbol/equivalence_class/character_class.
2645146040Stjr   I'm not sure, but maybe enough.  */
2646146040Stjr#define BRACKET_NAME_BUF_SIZE 32
2647146040Stjr
2648146040Stjr#ifndef _LIBC
2649146040Stjr  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2650146040Stjr     Build the range expression which starts from START_ELEM, and ends
2651146040Stjr     at END_ELEM.  The result are written to MBCSET and SBCSET.
2652146040Stjr     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2653146040Stjr     mbcset->range_ends, is a pointer argument sinse we may
2654146040Stjr     update it.  */
2655146040Stjr
2656146040Stjrstatic reg_errcode_t
2657146040Stjr# ifdef RE_ENABLE_I18N
2658146040Stjrbuild_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2659146040Stjr     re_charset_t *mbcset;
2660146040Stjr     int *range_alloc;
2661146040Stjr# else /* not RE_ENABLE_I18N */
2662146040Stjrbuild_range_exp (sbcset, start_elem, end_elem)
2663146040Stjr# endif /* not RE_ENABLE_I18N */
2664146040Stjr     re_bitset_ptr_t sbcset;
2665146040Stjr     bracket_elem_t *start_elem, *end_elem;
2666146040Stjr{
2667146040Stjr  unsigned int start_ch, end_ch;
2668146040Stjr  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2669146040Stjr  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2670146040Stjr	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2671146040Stjr	  0))
2672146040Stjr    return REG_ERANGE;
2673146040Stjr
2674146040Stjr  /* We can handle no multi character collating elements without libc
2675146040Stjr     support.  */
2676146040Stjr  if (BE ((start_elem->type == COLL_SYM
2677146040Stjr	   && strlen ((char *) start_elem->opr.name) > 1)
2678146040Stjr	  || (end_elem->type == COLL_SYM
2679146040Stjr	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2680146040Stjr    return REG_ECOLLATE;
2681146040Stjr
2682146040Stjr# ifdef RE_ENABLE_I18N
2683146040Stjr  {
2684146040Stjr    wchar_t wc, start_wc, end_wc;
2685146040Stjr    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2686146040Stjr
2687146040Stjr    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2688146040Stjr		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2689146040Stjr		   : 0));
2690146040Stjr    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2691146040Stjr	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2692146040Stjr		 : 0));
2693146040Stjr    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2694146040Stjr		? __btowc (start_ch) : start_elem->opr.wch);
2695146040Stjr    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2696146040Stjr	      ? __btowc (end_ch) : end_elem->opr.wch);
2697146040Stjr    if (start_wc == WEOF || end_wc == WEOF)
2698146040Stjr      return REG_ECOLLATE;
2699146040Stjr    cmp_buf[0] = start_wc;
2700146040Stjr    cmp_buf[4] = end_wc;
2701146040Stjr    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2702146040Stjr      return REG_ERANGE;
2703146040Stjr
2704146040Stjr    /* Got valid collation sequence values, add them as a new entry.
2705146040Stjr       However, for !_LIBC we have no collation elements: if the
2706146040Stjr       character set is single byte, the single byte character set
2707146040Stjr       that we build below suffices.  parse_bracket_exp passes
2708146040Stjr       no MBCSET if dfa->mb_cur_max == 1.  */
2709146040Stjr    if (mbcset)
2710146040Stjr      {
2711146040Stjr        /* Check the space of the arrays.  */
2712146040Stjr        if (BE (*range_alloc == mbcset->nranges, 0))
2713146040Stjr          {
2714146040Stjr	    /* There is not enough space, need realloc.  */
2715146040Stjr	    wchar_t *new_array_start, *new_array_end;
2716146040Stjr	    int new_nranges;
2717146040Stjr
2718146040Stjr	    /* +1 in case of mbcset->nranges is 0.  */
2719146040Stjr	    new_nranges = 2 * mbcset->nranges + 1;
2720146040Stjr	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2721146040Stjr	       are NULL if *range_alloc == 0.  */
2722146040Stjr	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2723146040Stjr				          new_nranges);
2724146040Stjr	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2725146040Stjr				        new_nranges);
2726146040Stjr
2727146040Stjr	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2728146040Stjr	      return REG_ESPACE;
2729146040Stjr
2730146040Stjr	    mbcset->range_starts = new_array_start;
2731146040Stjr	    mbcset->range_ends = new_array_end;
2732146040Stjr	    *range_alloc = new_nranges;
2733146040Stjr          }
2734146040Stjr
2735146040Stjr        mbcset->range_starts[mbcset->nranges] = start_wc;
2736146040Stjr        mbcset->range_ends[mbcset->nranges++] = end_wc;
2737146040Stjr      }
2738146040Stjr
2739146040Stjr    /* Build the table for single byte characters.  */
2740146040Stjr    for (wc = 0; wc < SBC_MAX; ++wc)
2741146040Stjr      {
2742146040Stjr	cmp_buf[2] = wc;
2743146040Stjr	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2744146040Stjr	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2745146040Stjr	  bitset_set (sbcset, wc);
2746146040Stjr      }
2747146040Stjr  }
2748146040Stjr# else /* not RE_ENABLE_I18N */
2749146040Stjr  {
2750146040Stjr    unsigned int ch;
2751146040Stjr    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2752146040Stjr		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2753146040Stjr		   : 0));
2754146040Stjr    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2755146040Stjr	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2756146040Stjr		 : 0));
2757146040Stjr    if (start_ch > end_ch)
2758146040Stjr      return REG_ERANGE;
2759146040Stjr    /* Build the table for single byte characters.  */
2760146040Stjr    for (ch = 0; ch < SBC_MAX; ++ch)
2761146040Stjr      if (start_ch <= ch  && ch <= end_ch)
2762146040Stjr	bitset_set (sbcset, ch);
2763146040Stjr  }
2764146040Stjr# endif /* not RE_ENABLE_I18N */
2765146040Stjr  return REG_NOERROR;
2766146040Stjr}
2767146040Stjr#endif /* not _LIBC */
2768146040Stjr
2769146040Stjr#ifndef _LIBC
2770146040Stjr/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2771146040Stjr   Build the collating element which is represented by NAME.
2772146040Stjr   The result are written to MBCSET and SBCSET.
2773146040Stjr   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2774146040Stjr   pointer argument since we may update it.  */
2775146040Stjr
2776146040Stjrstatic reg_errcode_t
2777146040Stjr# ifdef RE_ENABLE_I18N
2778146040Stjrbuild_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2779146040Stjr     re_charset_t *mbcset;
2780146040Stjr     int *coll_sym_alloc;
2781146040Stjr# else /* not RE_ENABLE_I18N */
2782146040Stjrbuild_collating_symbol (sbcset, name)
2783146040Stjr# endif /* not RE_ENABLE_I18N */
2784146040Stjr     re_bitset_ptr_t sbcset;
2785146040Stjr     const unsigned char *name;
2786146040Stjr{
2787146040Stjr  size_t name_len = strlen ((const char *) name);
2788146040Stjr  if (BE (name_len != 1, 0))
2789146040Stjr    return REG_ECOLLATE;
2790146040Stjr  else
2791146040Stjr    {
2792146040Stjr      bitset_set (sbcset, name[0]);
2793146040Stjr      return REG_NOERROR;
2794146040Stjr    }
2795146040Stjr}
2796146040Stjr#endif /* not _LIBC */
2797146040Stjr
2798146040Stjr/* This function parse bracket expression like "[abc]", "[a-c]",
2799146040Stjr   "[[.a-a.]]" etc.  */
2800146040Stjr
2801146040Stjrstatic bin_tree_t *
2802146040Stjrparse_bracket_exp (regexp, dfa, token, syntax, err)
2803146040Stjr     re_string_t *regexp;
2804146040Stjr     re_dfa_t *dfa;
2805146040Stjr     re_token_t *token;
2806146040Stjr     reg_syntax_t syntax;
2807146040Stjr     reg_errcode_t *err;
2808146040Stjr{
2809146040Stjr#ifdef _LIBC
2810146040Stjr  const unsigned char *collseqmb;
2811146040Stjr  const char *collseqwc;
2812146040Stjr  uint32_t nrules;
2813146040Stjr  int32_t table_size;
2814146040Stjr  const int32_t *symb_table;
2815146040Stjr  const unsigned char *extra;
2816146040Stjr
2817146040Stjr  /* Local function for parse_bracket_exp used in _LIBC environement.
2818146040Stjr     Seek the collating symbol entry correspondings to NAME.
2819146040Stjr     Return the index of the symbol in the SYMB_TABLE.  */
2820146040Stjr
2821146040Stjr  auto inline int32_t
2822146040Stjr  __attribute ((always_inline))
2823146040Stjr  seek_collating_symbol_entry (name, name_len)
2824146040Stjr	 const unsigned char *name;
2825146040Stjr	 size_t name_len;
2826146040Stjr    {
2827146040Stjr      int32_t hash = elem_hash ((const char *) name, name_len);
2828146040Stjr      int32_t elem = hash % table_size;
2829146040Stjr      int32_t second = hash % (table_size - 2);
2830146040Stjr      while (symb_table[2 * elem] != 0)
2831146040Stjr	{
2832146040Stjr	  /* First compare the hashing value.  */
2833146040Stjr	  if (symb_table[2 * elem] == hash
2834146040Stjr	      /* Compare the length of the name.  */
2835146040Stjr	      && name_len == extra[symb_table[2 * elem + 1]]
2836146040Stjr	      /* Compare the name.  */
2837146040Stjr	      && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2838146040Stjr			 name_len) == 0)
2839146040Stjr	    {
2840146040Stjr	      /* Yep, this is the entry.  */
2841146040Stjr	      break;
2842146040Stjr	    }
2843146040Stjr
2844146040Stjr	  /* Next entry.  */
2845146040Stjr	  elem += second;
2846146040Stjr	}
2847146040Stjr      return elem;
2848146040Stjr    }
2849146040Stjr
2850146040Stjr  /* Local function for parse_bracket_exp used in _LIBC environement.
2851146040Stjr     Look up the collation sequence value of BR_ELEM.
2852146040Stjr     Return the value if succeeded, UINT_MAX otherwise.  */
2853146040Stjr
2854146040Stjr  auto inline unsigned int
2855146040Stjr  __attribute ((always_inline))
2856146040Stjr  lookup_collation_sequence_value (br_elem)
2857146040Stjr	 bracket_elem_t *br_elem;
2858146040Stjr    {
2859146040Stjr      if (br_elem->type == SB_CHAR)
2860146040Stjr	{
2861146040Stjr	  /*
2862146040Stjr	  if (MB_CUR_MAX == 1)
2863146040Stjr	  */
2864146040Stjr	  if (nrules == 0)
2865146040Stjr	    return collseqmb[br_elem->opr.ch];
2866146040Stjr	  else
2867146040Stjr	    {
2868146040Stjr	      wint_t wc = __btowc (br_elem->opr.ch);
2869146040Stjr	      return __collseq_table_lookup (collseqwc, wc);
2870146040Stjr	    }
2871146040Stjr	}
2872146040Stjr      else if (br_elem->type == MB_CHAR)
2873146040Stjr	{
2874146040Stjr	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2875146040Stjr	}
2876146040Stjr      else if (br_elem->type == COLL_SYM)
2877146040Stjr	{
2878146040Stjr	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2879146040Stjr	  if (nrules != 0)
2880146040Stjr	    {
2881146040Stjr	      int32_t elem, idx;
2882146040Stjr	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2883146040Stjr						  sym_name_len);
2884146040Stjr	      if (symb_table[2 * elem] != 0)
2885146040Stjr		{
2886146040Stjr		  /* We found the entry.  */
2887146040Stjr		  idx = symb_table[2 * elem + 1];
2888146040Stjr		  /* Skip the name of collating element name.  */
2889146040Stjr		  idx += 1 + extra[idx];
2890146040Stjr		  /* Skip the byte sequence of the collating element.  */
2891146040Stjr		  idx += 1 + extra[idx];
2892146040Stjr		  /* Adjust for the alignment.  */
2893146040Stjr		  idx = (idx + 3) & ~3;
2894146040Stjr		  /* Skip the multibyte collation sequence value.  */
2895146040Stjr		  idx += sizeof (unsigned int);
2896146040Stjr		  /* Skip the wide char sequence of the collating element.  */
2897146040Stjr		  idx += sizeof (unsigned int) *
2898146040Stjr		    (1 + *(unsigned int *) (extra + idx));
2899146040Stjr		  /* Return the collation sequence value.  */
2900146040Stjr		  return *(unsigned int *) (extra + idx);
2901146040Stjr		}
2902146040Stjr	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2903146040Stjr		{
2904146040Stjr		  /* No valid character.  Match it as a single byte
2905146040Stjr		     character.  */
2906146040Stjr		  return collseqmb[br_elem->opr.name[0]];
2907146040Stjr		}
2908146040Stjr	    }
2909146040Stjr	  else if (sym_name_len == 1)
2910146040Stjr	    return collseqmb[br_elem->opr.name[0]];
2911146040Stjr	}
2912146040Stjr      return UINT_MAX;
2913146040Stjr    }
2914146040Stjr
2915146040Stjr  /* Local function for parse_bracket_exp used in _LIBC environement.
2916146040Stjr     Build the range expression which starts from START_ELEM, and ends
2917146040Stjr     at END_ELEM.  The result are written to MBCSET and SBCSET.
2918146040Stjr     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2919146040Stjr     mbcset->range_ends, is a pointer argument sinse we may
2920146040Stjr     update it.  */
2921146040Stjr
2922146040Stjr  auto inline reg_errcode_t
2923146040Stjr  __attribute ((always_inline))
2924146040Stjr  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2925146040Stjr	 re_charset_t *mbcset;
2926146040Stjr	 int *range_alloc;
2927146040Stjr	 re_bitset_ptr_t sbcset;
2928146040Stjr	 bracket_elem_t *start_elem, *end_elem;
2929146040Stjr    {
2930146040Stjr      unsigned int ch;
2931146040Stjr      uint32_t start_collseq;
2932146040Stjr      uint32_t end_collseq;
2933146040Stjr
2934146040Stjr      /* Equivalence Classes and Character Classes can't be a range
2935146040Stjr	 start/end.  */
2936146040Stjr      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2937146040Stjr	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2938146040Stjr	      0))
2939146040Stjr	return REG_ERANGE;
2940146040Stjr
2941146040Stjr      start_collseq = lookup_collation_sequence_value (start_elem);
2942146040Stjr      end_collseq = lookup_collation_sequence_value (end_elem);
2943146040Stjr      /* Check start/end collation sequence values.  */
2944146040Stjr      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2945146040Stjr	return REG_ECOLLATE;
2946146040Stjr      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2947146040Stjr	return REG_ERANGE;
2948146040Stjr
2949146040Stjr      /* Got valid collation sequence values, add them as a new entry.
2950146040Stjr	 However, if we have no collation elements, and the character set
2951146040Stjr	 is single byte, the single byte character set that we
2952146040Stjr	 build below suffices. */
2953146040Stjr      if (nrules > 0 || dfa->mb_cur_max > 1)
2954146040Stjr	{
2955146040Stjr          /* Check the space of the arrays.  */
2956146040Stjr          if (BE (*range_alloc == mbcset->nranges, 0))
2957146040Stjr	    {
2958146040Stjr	      /* There is not enough space, need realloc.  */
2959146040Stjr	      uint32_t *new_array_start;
2960146040Stjr	      uint32_t *new_array_end;
2961146040Stjr	      int new_nranges;
2962146040Stjr
2963146040Stjr	      /* +1 in case of mbcset->nranges is 0.  */
2964146040Stjr	      new_nranges = 2 * mbcset->nranges + 1;
2965146040Stjr	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2966146040Stjr					    new_nranges);
2967146040Stjr	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2968146040Stjr				          new_nranges);
2969146040Stjr
2970146040Stjr	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2971146040Stjr	        return REG_ESPACE;
2972146040Stjr
2973146040Stjr	      mbcset->range_starts = new_array_start;
2974146040Stjr	      mbcset->range_ends = new_array_end;
2975146040Stjr	      *range_alloc = new_nranges;
2976146040Stjr	    }
2977146040Stjr
2978146040Stjr          mbcset->range_starts[mbcset->nranges] = start_collseq;
2979146040Stjr          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2980146040Stjr	}
2981146040Stjr
2982146040Stjr      /* Build the table for single byte characters.  */
2983146040Stjr      for (ch = 0; ch < SBC_MAX; ch++)
2984146040Stjr	{
2985146040Stjr	  uint32_t ch_collseq;
2986146040Stjr	  /*
2987146040Stjr	  if (MB_CUR_MAX == 1)
2988146040Stjr	  */
2989146040Stjr	  if (nrules == 0)
2990146040Stjr	    ch_collseq = collseqmb[ch];
2991146040Stjr	  else
2992146040Stjr	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2993146040Stjr	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2994146040Stjr	    bitset_set (sbcset, ch);
2995146040Stjr	}
2996146040Stjr      return REG_NOERROR;
2997146040Stjr    }
2998146040Stjr
2999146040Stjr  /* Local function for parse_bracket_exp used in _LIBC environement.
3000146040Stjr     Build the collating element which is represented by NAME.
3001146040Stjr     The result are written to MBCSET and SBCSET.
3002146040Stjr     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3003146040Stjr     pointer argument sinse we may update it.  */
3004146040Stjr
3005146040Stjr  auto inline reg_errcode_t
3006146040Stjr  __attribute ((always_inline))
3007146040Stjr  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3008146040Stjr	 re_charset_t *mbcset;
3009146040Stjr	 int *coll_sym_alloc;
3010146040Stjr	 re_bitset_ptr_t sbcset;
3011146040Stjr	 const unsigned char *name;
3012146040Stjr    {
3013146040Stjr      int32_t elem, idx;
3014146040Stjr      size_t name_len = strlen ((const char *) name);
3015146040Stjr      if (nrules != 0)
3016146040Stjr	{
3017146040Stjr	  elem = seek_collating_symbol_entry (name, name_len);
3018146040Stjr	  if (symb_table[2 * elem] != 0)
3019146040Stjr	    {
3020146040Stjr	      /* We found the entry.  */
3021146040Stjr	      idx = symb_table[2 * elem + 1];
3022146040Stjr	      /* Skip the name of collating element name.  */
3023146040Stjr	      idx += 1 + extra[idx];
3024146040Stjr	    }
3025146040Stjr	  else if (symb_table[2 * elem] == 0 && name_len == 1)
3026146040Stjr	    {
3027146040Stjr	      /* No valid character, treat it as a normal
3028146040Stjr		 character.  */
3029146040Stjr	      bitset_set (sbcset, name[0]);
3030146040Stjr	      return REG_NOERROR;
3031146040Stjr	    }
3032146040Stjr	  else
3033146040Stjr	    return REG_ECOLLATE;
3034146040Stjr
3035146040Stjr	  /* Got valid collation sequence, add it as a new entry.  */
3036146040Stjr	  /* Check the space of the arrays.  */
3037146040Stjr	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3038146040Stjr	    {
3039146040Stjr	      /* Not enough, realloc it.  */
3040146040Stjr	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3041146040Stjr	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3042146040Stjr	      /* Use realloc since mbcset->coll_syms is NULL
3043146040Stjr		 if *alloc == 0.  */
3044146040Stjr	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3045146040Stjr						   new_coll_sym_alloc);
3046146040Stjr	      if (BE (new_coll_syms == NULL, 0))
3047146040Stjr		return REG_ESPACE;
3048146040Stjr	      mbcset->coll_syms = new_coll_syms;
3049146040Stjr	      *coll_sym_alloc = new_coll_sym_alloc;
3050146040Stjr	    }
3051146040Stjr	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3052146040Stjr	  return REG_NOERROR;
3053146040Stjr	}
3054146040Stjr      else
3055146040Stjr	{
3056146040Stjr	  if (BE (name_len != 1, 0))
3057146040Stjr	    return REG_ECOLLATE;
3058146040Stjr	  else
3059146040Stjr	    {
3060146040Stjr	      bitset_set (sbcset, name[0]);
3061146040Stjr	      return REG_NOERROR;
3062146040Stjr	    }
3063146040Stjr	}
3064146040Stjr    }
3065146040Stjr#endif
3066146040Stjr
3067146040Stjr  re_token_t br_token;
3068146040Stjr  re_bitset_ptr_t sbcset;
3069146040Stjr#ifdef RE_ENABLE_I18N
3070146040Stjr  re_charset_t *mbcset;
3071146040Stjr  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3072146040Stjr  int equiv_class_alloc = 0, char_class_alloc = 0;
3073146040Stjr#endif /* not RE_ENABLE_I18N */
3074146040Stjr  int non_match = 0;
3075146040Stjr  bin_tree_t *work_tree;
3076146040Stjr  int token_len;
3077146040Stjr  int first_round = 1;
3078146040Stjr#ifdef _LIBC
3079146040Stjr  collseqmb = (const unsigned char *)
3080146040Stjr    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3081146040Stjr  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3082146040Stjr  if (nrules)
3083146040Stjr    {
3084146040Stjr      /*
3085146040Stjr      if (MB_CUR_MAX > 1)
3086146040Stjr      */
3087146040Stjr	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3088146040Stjr      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3089146040Stjr      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3090146040Stjr						  _NL_COLLATE_SYMB_TABLEMB);
3091146040Stjr      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3092146040Stjr						   _NL_COLLATE_SYMB_EXTRAMB);
3093146040Stjr    }
3094146040Stjr#endif
3095146040Stjr  sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3096146040Stjr#ifdef RE_ENABLE_I18N
3097146040Stjr  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3098146040Stjr#endif /* RE_ENABLE_I18N */
3099146040Stjr#ifdef RE_ENABLE_I18N
3100146040Stjr  if (BE (sbcset == NULL || mbcset == NULL, 0))
3101146040Stjr#else
3102146040Stjr  if (BE (sbcset == NULL, 0))
3103146040Stjr#endif /* RE_ENABLE_I18N */
3104146040Stjr    {
3105146040Stjr      *err = REG_ESPACE;
3106146040Stjr      return NULL;
3107146040Stjr    }
3108146040Stjr
3109146040Stjr  token_len = peek_token_bracket (token, regexp, syntax);
3110146040Stjr  if (BE (token->type == END_OF_RE, 0))
3111146040Stjr    {
3112146040Stjr      *err = REG_BADPAT;
3113146040Stjr      goto parse_bracket_exp_free_return;
3114146040Stjr    }
3115146040Stjr  if (token->type == OP_NON_MATCH_LIST)
3116146040Stjr    {
3117146040Stjr#ifdef RE_ENABLE_I18N
3118146040Stjr      mbcset->non_match = 1;
3119146040Stjr#endif /* not RE_ENABLE_I18N */
3120146040Stjr      non_match = 1;
3121146040Stjr      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3122146040Stjr	bitset_set (sbcset, '\0');
3123146040Stjr      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3124146040Stjr      token_len = peek_token_bracket (token, regexp, syntax);
3125146040Stjr      if (BE (token->type == END_OF_RE, 0))
3126146040Stjr	{
3127146040Stjr	  *err = REG_BADPAT;
3128146040Stjr	  goto parse_bracket_exp_free_return;
3129146040Stjr	}
3130146040Stjr    }
3131146040Stjr
3132146040Stjr  /* We treat the first ']' as a normal character.  */
3133146040Stjr  if (token->type == OP_CLOSE_BRACKET)
3134146040Stjr    token->type = CHARACTER;
3135146040Stjr
3136146040Stjr  while (1)
3137146040Stjr    {
3138146040Stjr      bracket_elem_t start_elem, end_elem;
3139146040Stjr      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3140146040Stjr      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3141146040Stjr      reg_errcode_t ret;
3142146040Stjr      int token_len2 = 0, is_range_exp = 0;
3143146040Stjr      re_token_t token2;
3144146040Stjr
3145146040Stjr      start_elem.opr.name = start_name_buf;
3146146040Stjr      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3147146040Stjr				   syntax, first_round);
3148146040Stjr      if (BE (ret != REG_NOERROR, 0))
3149146040Stjr	{
3150146040Stjr	  *err = ret;
3151146040Stjr	  goto parse_bracket_exp_free_return;
3152146040Stjr	}
3153146040Stjr      first_round = 0;
3154146040Stjr
3155146040Stjr      /* Get information about the next token.  We need it in any case.  */
3156146040Stjr      token_len = peek_token_bracket (token, regexp, syntax);
3157146040Stjr
3158146040Stjr      /* Do not check for ranges if we know they are not allowed.  */
3159146040Stjr      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3160146040Stjr	{
3161146040Stjr	  if (BE (token->type == END_OF_RE, 0))
3162146040Stjr	    {
3163146040Stjr	      *err = REG_EBRACK;
3164146040Stjr	      goto parse_bracket_exp_free_return;
3165146040Stjr	    }
3166146040Stjr	  if (token->type == OP_CHARSET_RANGE)
3167146040Stjr	    {
3168146040Stjr	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3169146040Stjr	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3170146040Stjr	      if (BE (token2.type == END_OF_RE, 0))
3171146040Stjr		{
3172146040Stjr		  *err = REG_EBRACK;
3173146040Stjr		  goto parse_bracket_exp_free_return;
3174146040Stjr		}
3175146040Stjr	      if (token2.type == OP_CLOSE_BRACKET)
3176146040Stjr		{
3177146040Stjr		  /* We treat the last '-' as a normal character.  */
3178146040Stjr		  re_string_skip_bytes (regexp, -token_len);
3179146040Stjr		  token->type = CHARACTER;
3180146040Stjr		}
3181146040Stjr	      else
3182146040Stjr		is_range_exp = 1;
3183146040Stjr	    }
3184146040Stjr	}
3185146040Stjr
3186146040Stjr      if (is_range_exp == 1)
3187146040Stjr	{
3188146040Stjr	  end_elem.opr.name = end_name_buf;
3189146040Stjr	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3190146040Stjr				       dfa, syntax, 1);
3191146040Stjr	  if (BE (ret != REG_NOERROR, 0))
3192146040Stjr	    {
3193146040Stjr	      *err = ret;
3194146040Stjr	      goto parse_bracket_exp_free_return;
3195146040Stjr	    }
3196146040Stjr
3197146040Stjr	  token_len = peek_token_bracket (token, regexp, syntax);
3198146040Stjr
3199146040Stjr#ifdef _LIBC
3200146040Stjr	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3201146040Stjr				  &start_elem, &end_elem);
3202146040Stjr#else
3203146040Stjr# ifdef RE_ENABLE_I18N
3204146040Stjr	  *err = build_range_exp (sbcset,
3205146040Stjr				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3206146040Stjr				  &range_alloc, &start_elem, &end_elem);
3207146040Stjr# else
3208146040Stjr	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3209146040Stjr# endif
3210146040Stjr#endif /* RE_ENABLE_I18N */
3211146040Stjr	  if (BE (*err != REG_NOERROR, 0))
3212146040Stjr	    goto parse_bracket_exp_free_return;
3213146040Stjr	}
3214146040Stjr      else
3215146040Stjr	{
3216146040Stjr	  switch (start_elem.type)
3217146040Stjr	    {
3218146040Stjr	    case SB_CHAR:
3219146040Stjr	      bitset_set (sbcset, start_elem.opr.ch);
3220146040Stjr	      break;
3221146040Stjr#ifdef RE_ENABLE_I18N
3222146040Stjr	    case MB_CHAR:
3223146040Stjr	      /* Check whether the array has enough space.  */
3224146040Stjr	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3225146040Stjr		{
3226146040Stjr		  wchar_t *new_mbchars;
3227146040Stjr		  /* Not enough, realloc it.  */
3228146040Stjr		  /* +1 in case of mbcset->nmbchars is 0.  */
3229146040Stjr		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3230146040Stjr		  /* Use realloc since array is NULL if *alloc == 0.  */
3231146040Stjr		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3232146040Stjr					    mbchar_alloc);
3233146040Stjr		  if (BE (new_mbchars == NULL, 0))
3234146040Stjr		    goto parse_bracket_exp_espace;
3235146040Stjr		  mbcset->mbchars = new_mbchars;
3236146040Stjr		}
3237146040Stjr	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3238146040Stjr	      break;
3239146040Stjr#endif /* RE_ENABLE_I18N */
3240146040Stjr	    case EQUIV_CLASS:
3241146040Stjr	      *err = build_equiv_class (sbcset,
3242146040Stjr#ifdef RE_ENABLE_I18N
3243146040Stjr					mbcset, &equiv_class_alloc,
3244146040Stjr#endif /* RE_ENABLE_I18N */
3245146040Stjr					start_elem.opr.name);
3246146040Stjr	      if (BE (*err != REG_NOERROR, 0))
3247146040Stjr		goto parse_bracket_exp_free_return;
3248146040Stjr	      break;
3249146040Stjr	    case COLL_SYM:
3250146040Stjr	      *err = build_collating_symbol (sbcset,
3251146040Stjr#ifdef RE_ENABLE_I18N
3252146040Stjr					     mbcset, &coll_sym_alloc,
3253146040Stjr#endif /* RE_ENABLE_I18N */
3254146040Stjr					     start_elem.opr.name);
3255146040Stjr	      if (BE (*err != REG_NOERROR, 0))
3256146040Stjr		goto parse_bracket_exp_free_return;
3257146040Stjr	      break;
3258146040Stjr	    case CHAR_CLASS:
3259146040Stjr	      *err = build_charclass (regexp->trans, sbcset,
3260146040Stjr#ifdef RE_ENABLE_I18N
3261146040Stjr				      mbcset, &char_class_alloc,
3262146040Stjr#endif /* RE_ENABLE_I18N */
3263146040Stjr				      start_elem.opr.name, syntax);
3264146040Stjr	      if (BE (*err != REG_NOERROR, 0))
3265146040Stjr	       goto parse_bracket_exp_free_return;
3266146040Stjr	      break;
3267146040Stjr	    default:
3268146040Stjr	      assert (0);
3269146040Stjr	      break;
3270146040Stjr	    }
3271146040Stjr	}
3272146040Stjr      if (BE (token->type == END_OF_RE, 0))
3273146040Stjr	{
3274146040Stjr	  *err = REG_EBRACK;
3275146040Stjr	  goto parse_bracket_exp_free_return;
3276146040Stjr	}
3277146040Stjr      if (token->type == OP_CLOSE_BRACKET)
3278146040Stjr	break;
3279146040Stjr    }
3280146040Stjr
3281146040Stjr  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3282146040Stjr
3283146040Stjr  /* If it is non-matching list.  */
3284146040Stjr  if (non_match)
3285146040Stjr    bitset_not (sbcset);
3286146040Stjr
3287146040Stjr#ifdef RE_ENABLE_I18N
3288146040Stjr  /* Ensure only single byte characters are set.  */
3289146040Stjr  if (dfa->mb_cur_max > 1)
3290146040Stjr    bitset_mask (sbcset, dfa->sb_char);
3291146040Stjr
3292146040Stjr  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3293146040Stjr      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3294146040Stjr						     || mbcset->non_match)))
3295146040Stjr    {
3296146040Stjr      bin_tree_t *mbc_tree;
3297146040Stjr      int sbc_idx;
3298146040Stjr      /* Build a tree for complex bracket.  */
3299146040Stjr      dfa->has_mb_node = 1;
3300146040Stjr      br_token.type = COMPLEX_BRACKET;
3301146040Stjr      br_token.opr.mbcset = mbcset;
3302146040Stjr      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3303146040Stjr      if (BE (mbc_tree == NULL, 0))
3304146040Stjr	goto parse_bracket_exp_espace;
3305146040Stjr      for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3306146040Stjr	if (sbcset[sbc_idx])
3307146040Stjr	  break;
3308146040Stjr      /* If there are no bits set in sbcset, there is no point
3309146040Stjr	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3310146040Stjr      if (sbc_idx < BITSET_UINTS)
3311146040Stjr	{
3312146040Stjr          /* Build a tree for simple bracket.  */
3313146040Stjr          br_token.type = SIMPLE_BRACKET;
3314146040Stjr          br_token.opr.sbcset = sbcset;
3315146040Stjr          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3316146040Stjr          if (BE (work_tree == NULL, 0))
3317146040Stjr            goto parse_bracket_exp_espace;
3318146040Stjr
3319146040Stjr          /* Then join them by ALT node.  */
3320146040Stjr          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3321146040Stjr          if (BE (work_tree == NULL, 0))
3322146040Stjr            goto parse_bracket_exp_espace;
3323146040Stjr	}
3324146040Stjr      else
3325146040Stjr	{
3326146040Stjr	  re_free (sbcset);
3327146040Stjr	  work_tree = mbc_tree;
3328146040Stjr	}
3329146040Stjr    }
3330146040Stjr  else
3331146040Stjr#endif /* not RE_ENABLE_I18N */
3332146040Stjr    {
3333146040Stjr#ifdef RE_ENABLE_I18N
3334146040Stjr      free_charset (mbcset);
3335146040Stjr#endif
3336146040Stjr      /* Build a tree for simple bracket.  */
3337146040Stjr      br_token.type = SIMPLE_BRACKET;
3338146040Stjr      br_token.opr.sbcset = sbcset;
3339146040Stjr      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3340146040Stjr      if (BE (work_tree == NULL, 0))
3341146040Stjr        goto parse_bracket_exp_espace;
3342146040Stjr    }
3343146040Stjr  return work_tree;
3344146040Stjr
3345146040Stjr parse_bracket_exp_espace:
3346146040Stjr  *err = REG_ESPACE;
3347146040Stjr parse_bracket_exp_free_return:
3348146040Stjr  re_free (sbcset);
3349146040Stjr#ifdef RE_ENABLE_I18N
3350146040Stjr  free_charset (mbcset);
3351146040Stjr#endif /* RE_ENABLE_I18N */
3352146040Stjr  return NULL;
3353146040Stjr}
3354146040Stjr
3355146040Stjr/* Parse an element in the bracket expression.  */
3356146040Stjr
3357146040Stjrstatic reg_errcode_t
3358146040Stjrparse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3359146040Stjr		       accept_hyphen)
3360146040Stjr     bracket_elem_t *elem;
3361146040Stjr     re_string_t *regexp;
3362146040Stjr     re_token_t *token;
3363146040Stjr     int token_len;
3364146040Stjr     re_dfa_t *dfa;
3365146040Stjr     reg_syntax_t syntax;
3366146040Stjr     int accept_hyphen;
3367146040Stjr{
3368146040Stjr#ifdef RE_ENABLE_I18N
3369146040Stjr  int cur_char_size;
3370146040Stjr  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3371146040Stjr  if (cur_char_size > 1)
3372146040Stjr    {
3373146040Stjr      elem->type = MB_CHAR;
3374146040Stjr      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3375146040Stjr      re_string_skip_bytes (regexp, cur_char_size);
3376146040Stjr      return REG_NOERROR;
3377146040Stjr    }
3378146040Stjr#endif /* RE_ENABLE_I18N */
3379146040Stjr  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3380146040Stjr  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3381146040Stjr      || token->type == OP_OPEN_EQUIV_CLASS)
3382146040Stjr    return parse_bracket_symbol (elem, regexp, token);
3383146040Stjr  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3384146040Stjr    {
3385146040Stjr      /* A '-' must only appear as anything but a range indicator before
3386146040Stjr	 the closing bracket.  Everything else is an error.  */
3387146040Stjr      re_token_t token2;
3388146040Stjr      (void) peek_token_bracket (&token2, regexp, syntax);
3389146040Stjr      if (token2.type != OP_CLOSE_BRACKET)
3390146040Stjr	/* The actual error value is not standardized since this whole
3391146040Stjr	   case is undefined.  But ERANGE makes good sense.  */
3392146040Stjr	return REG_ERANGE;
3393146040Stjr    }
3394146040Stjr  elem->type = SB_CHAR;
3395146040Stjr  elem->opr.ch = token->opr.c;
3396146040Stjr  return REG_NOERROR;
3397146040Stjr}
3398146040Stjr
3399146040Stjr/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3400146040Stjr   such as [:<character_class>:], [.<collating_element>.], and
3401146040Stjr   [=<equivalent_class>=].  */
3402146040Stjr
3403146040Stjrstatic reg_errcode_t
3404146040Stjrparse_bracket_symbol (elem, regexp, token)
3405146040Stjr     bracket_elem_t *elem;
3406146040Stjr     re_string_t *regexp;
3407146040Stjr     re_token_t *token;
3408146040Stjr{
3409146040Stjr  unsigned char ch, delim = token->opr.c;
3410146040Stjr  int i = 0;
3411146040Stjr  if (re_string_eoi(regexp))
3412146040Stjr    return REG_EBRACK;
3413146040Stjr  for (;; ++i)
3414146040Stjr    {
3415146040Stjr      if (i >= BRACKET_NAME_BUF_SIZE)
3416146040Stjr	return REG_EBRACK;
3417146040Stjr      if (token->type == OP_OPEN_CHAR_CLASS)
3418146040Stjr	ch = re_string_fetch_byte_case (regexp);
3419146040Stjr      else
3420146040Stjr	ch = re_string_fetch_byte (regexp);
3421146040Stjr      if (re_string_eoi(regexp))
3422146040Stjr	return REG_EBRACK;
3423146040Stjr      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3424146040Stjr	break;
3425146040Stjr      elem->opr.name[i] = ch;
3426146040Stjr    }
3427146040Stjr  re_string_skip_bytes (regexp, 1);
3428146040Stjr  elem->opr.name[i] = '\0';
3429146040Stjr  switch (token->type)
3430146040Stjr    {
3431146040Stjr    case OP_OPEN_COLL_ELEM:
3432146040Stjr      elem->type = COLL_SYM;
3433146040Stjr      break;
3434146040Stjr    case OP_OPEN_EQUIV_CLASS:
3435146040Stjr      elem->type = EQUIV_CLASS;
3436146040Stjr      break;
3437146040Stjr    case OP_OPEN_CHAR_CLASS:
3438146040Stjr      elem->type = CHAR_CLASS;
3439146040Stjr      break;
3440146040Stjr    default:
3441146040Stjr      break;
3442146040Stjr    }
3443146040Stjr  return REG_NOERROR;
3444146040Stjr}
3445146040Stjr
3446146040Stjr  /* Helper function for parse_bracket_exp.
3447146040Stjr     Build the equivalence class which is represented by NAME.
3448146040Stjr     The result are written to MBCSET and SBCSET.
3449146040Stjr     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3450146040Stjr     is a pointer argument sinse we may update it.  */
3451146040Stjr
3452146040Stjrstatic reg_errcode_t
3453146040Stjr#ifdef RE_ENABLE_I18N
3454146040Stjrbuild_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3455146040Stjr     re_charset_t *mbcset;
3456146040Stjr     int *equiv_class_alloc;
3457146040Stjr#else /* not RE_ENABLE_I18N */
3458146040Stjrbuild_equiv_class (sbcset, name)
3459146040Stjr#endif /* not RE_ENABLE_I18N */
3460146040Stjr     re_bitset_ptr_t sbcset;
3461146040Stjr     const unsigned char *name;
3462146040Stjr{
3463146040Stjr#if defined _LIBC
3464146040Stjr  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3465146040Stjr  if (nrules != 0)
3466146040Stjr    {
3467146040Stjr      const int32_t *table, *indirect;
3468146040Stjr      const unsigned char *weights, *extra, *cp;
3469146040Stjr      unsigned char char_buf[2];
3470146040Stjr      int32_t idx1, idx2;
3471146040Stjr      unsigned int ch;
3472146040Stjr      size_t len;
3473146040Stjr      /* This #include defines a local function!  */
3474146040Stjr# include <locale/weight.h>
3475146040Stjr      /* Calculate the index for equivalence class.  */
3476146040Stjr      cp = name;
3477146040Stjr      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3478146040Stjr      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3479146040Stjr					       _NL_COLLATE_WEIGHTMB);
3480146040Stjr      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3481146040Stjr						   _NL_COLLATE_EXTRAMB);
3482146040Stjr      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3483146040Stjr						_NL_COLLATE_INDIRECTMB);
3484146040Stjr      idx1 = findidx (&cp);
3485146040Stjr      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3486146040Stjr	/* This isn't a valid character.  */
3487146040Stjr	return REG_ECOLLATE;
3488146040Stjr
3489146040Stjr      /* Build single byte matcing table for this equivalence class.  */
3490146040Stjr      char_buf[1] = (unsigned char) '\0';
3491146040Stjr      len = weights[idx1];
3492146040Stjr      for (ch = 0; ch < SBC_MAX; ++ch)
3493146040Stjr	{
3494146040Stjr	  char_buf[0] = ch;
3495146040Stjr	  cp = char_buf;
3496146040Stjr	  idx2 = findidx (&cp);
3497146040Stjr/*
3498146040Stjr	  idx2 = table[ch];
3499146040Stjr*/
3500146040Stjr	  if (idx2 == 0)
3501146040Stjr	    /* This isn't a valid character.  */
3502146040Stjr	    continue;
3503146040Stjr	  if (len == weights[idx2])
3504146040Stjr	    {
3505146040Stjr	      int cnt = 0;
3506146040Stjr	      while (cnt <= len &&
3507146040Stjr		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3508146040Stjr		++cnt;
3509146040Stjr
3510146040Stjr	      if (cnt > len)
3511146040Stjr		bitset_set (sbcset, ch);
3512146040Stjr	    }
3513146040Stjr	}
3514146040Stjr      /* Check whether the array has enough space.  */
3515146040Stjr      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3516146040Stjr	{
3517146040Stjr	  /* Not enough, realloc it.  */
3518146040Stjr	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3519146040Stjr	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3520146040Stjr	  /* Use realloc since the array is NULL if *alloc == 0.  */
3521146040Stjr	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3522146040Stjr						   int32_t,
3523146040Stjr						   new_equiv_class_alloc);
3524146040Stjr	  if (BE (new_equiv_classes == NULL, 0))
3525146040Stjr	    return REG_ESPACE;
3526146040Stjr	  mbcset->equiv_classes = new_equiv_classes;
3527146040Stjr	  *equiv_class_alloc = new_equiv_class_alloc;
3528146040Stjr	}
3529146040Stjr      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3530146040Stjr    }
3531146040Stjr  else
3532146040Stjr#endif /* _LIBC */
3533146040Stjr    {
3534146040Stjr      if (BE (strlen ((const char *) name) != 1, 0))
3535146040Stjr	return REG_ECOLLATE;
3536146040Stjr      bitset_set (sbcset, *name);
3537146040Stjr    }
3538146040Stjr  return REG_NOERROR;
3539146040Stjr}
3540146040Stjr
3541146040Stjr  /* Helper function for parse_bracket_exp.
3542146040Stjr     Build the character class which is represented by NAME.
3543146040Stjr     The result are written to MBCSET and SBCSET.
3544146040Stjr     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3545146040Stjr     is a pointer argument sinse we may update it.  */
3546146040Stjr
3547146040Stjrstatic reg_errcode_t
3548146040Stjr#ifdef RE_ENABLE_I18N
3549146040Stjrbuild_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3550146040Stjr     re_charset_t *mbcset;
3551146040Stjr     int *char_class_alloc;
3552146040Stjr#else /* not RE_ENABLE_I18N */
3553146040Stjrbuild_charclass (trans, sbcset, class_name, syntax)
3554146040Stjr#endif /* not RE_ENABLE_I18N */
3555146040Stjr     unsigned RE_TRANSLATE_TYPE trans;
3556146040Stjr     re_bitset_ptr_t sbcset;
3557146040Stjr     const unsigned char *class_name;
3558146040Stjr     reg_syntax_t syntax;
3559146040Stjr{
3560146040Stjr  int i;
3561146040Stjr  const char *name = (const char *) class_name;
3562146040Stjr
3563146040Stjr  /* In case of REG_ICASE "upper" and "lower" match the both of
3564146040Stjr     upper and lower cases.  */
3565146040Stjr  if ((syntax & RE_ICASE)
3566146040Stjr      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3567146040Stjr    name = "alpha";
3568146040Stjr
3569146040Stjr#ifdef RE_ENABLE_I18N
3570146040Stjr  /* Check the space of the arrays.  */
3571146040Stjr  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3572146040Stjr    {
3573146040Stjr      /* Not enough, realloc it.  */
3574146040Stjr      /* +1 in case of mbcset->nchar_classes is 0.  */
3575146040Stjr      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3576146040Stjr      /* Use realloc since array is NULL if *alloc == 0.  */
3577146040Stjr      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3578146040Stjr					       new_char_class_alloc);
3579146040Stjr      if (BE (new_char_classes == NULL, 0))
3580146040Stjr	return REG_ESPACE;
3581146040Stjr      mbcset->char_classes = new_char_classes;
3582146040Stjr      *char_class_alloc = new_char_class_alloc;
3583146040Stjr    }
3584146040Stjr  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3585146040Stjr#endif /* RE_ENABLE_I18N */
3586146040Stjr
3587146040Stjr#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3588146040Stjr    for (i = 0; i < SBC_MAX; ++i)		\
3589146040Stjr      {						\
3590146040Stjr	if (ctype_func (i))			\
3591146040Stjr	  {					\
3592146040Stjr	    int ch = trans ? trans[i] : i;	\
3593146040Stjr	    bitset_set (sbcset, ch);		\
3594146040Stjr	  }					\
3595146040Stjr      }
3596146040Stjr
3597146040Stjr  if (strcmp (name, "alnum") == 0)
3598146040Stjr    BUILD_CHARCLASS_LOOP (isalnum)
3599146040Stjr  else if (strcmp (name, "cntrl") == 0)
3600146040Stjr    BUILD_CHARCLASS_LOOP (iscntrl)
3601146040Stjr  else if (strcmp (name, "lower") == 0)
3602146040Stjr    BUILD_CHARCLASS_LOOP (islower)
3603146040Stjr  else if (strcmp (name, "space") == 0)
3604146040Stjr    BUILD_CHARCLASS_LOOP (isspace)
3605146040Stjr  else if (strcmp (name, "alpha") == 0)
3606146040Stjr    BUILD_CHARCLASS_LOOP (isalpha)
3607146040Stjr  else if (strcmp (name, "digit") == 0)
3608146040Stjr    BUILD_CHARCLASS_LOOP (isdigit)
3609146040Stjr  else if (strcmp (name, "print") == 0)
3610146040Stjr    BUILD_CHARCLASS_LOOP (isprint)
3611146040Stjr  else if (strcmp (name, "upper") == 0)
3612146040Stjr    BUILD_CHARCLASS_LOOP (isupper)
3613146040Stjr  else if (strcmp (name, "blank") == 0)
3614146040Stjr    BUILD_CHARCLASS_LOOP (isblank)
3615146040Stjr  else if (strcmp (name, "graph") == 0)
3616146040Stjr    BUILD_CHARCLASS_LOOP (isgraph)
3617146040Stjr  else if (strcmp (name, "punct") == 0)
3618146040Stjr    BUILD_CHARCLASS_LOOP (ispunct)
3619146040Stjr  else if (strcmp (name, "xdigit") == 0)
3620146040Stjr    BUILD_CHARCLASS_LOOP (isxdigit)
3621146040Stjr  else
3622146040Stjr    return REG_ECTYPE;
3623146040Stjr
3624146040Stjr  return REG_NOERROR;
3625146040Stjr}
3626146040Stjr
3627146040Stjrstatic bin_tree_t *
3628146040Stjrbuild_charclass_op (dfa, trans, class_name, extra, non_match, err)
3629146040Stjr     re_dfa_t *dfa;
3630146040Stjr     unsigned RE_TRANSLATE_TYPE trans;
3631146040Stjr     const unsigned char *class_name;
3632146040Stjr     const unsigned char *extra;
3633146040Stjr     int non_match;
3634146040Stjr     reg_errcode_t *err;
3635146040Stjr{
3636146040Stjr  re_bitset_ptr_t sbcset;
3637146040Stjr#ifdef RE_ENABLE_I18N
3638146040Stjr  re_charset_t *mbcset;
3639146040Stjr  int alloc = 0;
3640146040Stjr#endif /* not RE_ENABLE_I18N */
3641146040Stjr  reg_errcode_t ret;
3642146040Stjr  re_token_t br_token;
3643146040Stjr  bin_tree_t *tree;
3644146040Stjr
3645146040Stjr  sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3646146040Stjr#ifdef RE_ENABLE_I18N
3647146040Stjr  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3648146040Stjr#endif /* RE_ENABLE_I18N */
3649146040Stjr
3650146040Stjr#ifdef RE_ENABLE_I18N
3651146040Stjr  if (BE (sbcset == NULL || mbcset == NULL, 0))
3652146040Stjr#else /* not RE_ENABLE_I18N */
3653146040Stjr  if (BE (sbcset == NULL, 0))
3654146040Stjr#endif /* not RE_ENABLE_I18N */
3655146040Stjr    {
3656146040Stjr      *err = REG_ESPACE;
3657146040Stjr      return NULL;
3658146040Stjr    }
3659146040Stjr
3660146040Stjr  if (non_match)
3661146040Stjr    {
3662146040Stjr#ifdef RE_ENABLE_I18N
3663146040Stjr      /*
3664146040Stjr      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3665146040Stjr	bitset_set(cset->sbcset, '\0');
3666146040Stjr      */
3667146040Stjr      mbcset->non_match = 1;
3668146040Stjr#endif /* not RE_ENABLE_I18N */
3669146040Stjr    }
3670146040Stjr
3671146040Stjr  /* We don't care the syntax in this case.  */
3672146040Stjr  ret = build_charclass (trans, sbcset,
3673146040Stjr#ifdef RE_ENABLE_I18N
3674146040Stjr			 mbcset, &alloc,
3675146040Stjr#endif /* RE_ENABLE_I18N */
3676146040Stjr			 class_name, 0);
3677146040Stjr
3678146040Stjr  if (BE (ret != REG_NOERROR, 0))
3679146040Stjr    {
3680146040Stjr      re_free (sbcset);
3681146040Stjr#ifdef RE_ENABLE_I18N
3682146040Stjr      free_charset (mbcset);
3683146040Stjr#endif /* RE_ENABLE_I18N */
3684146040Stjr      *err = ret;
3685146040Stjr      return NULL;
3686146040Stjr    }
3687146040Stjr  /* \w match '_' also.  */
3688146040Stjr  for (; *extra; extra++)
3689146040Stjr    bitset_set (sbcset, *extra);
3690146040Stjr
3691146040Stjr  /* If it is non-matching list.  */
3692146040Stjr  if (non_match)
3693146040Stjr    bitset_not (sbcset);
3694146040Stjr
3695146040Stjr#ifdef RE_ENABLE_I18N
3696146040Stjr  /* Ensure only single byte characters are set.  */
3697146040Stjr  if (dfa->mb_cur_max > 1)
3698146040Stjr    bitset_mask (sbcset, dfa->sb_char);
3699146040Stjr#endif
3700146040Stjr
3701146040Stjr  /* Build a tree for simple bracket.  */
3702146040Stjr  br_token.type = SIMPLE_BRACKET;
3703146040Stjr  br_token.opr.sbcset = sbcset;
3704146040Stjr  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3705146040Stjr  if (BE (tree == NULL, 0))
3706146040Stjr    goto build_word_op_espace;
3707146040Stjr
3708146040Stjr#ifdef RE_ENABLE_I18N
3709146040Stjr  if (dfa->mb_cur_max > 1)
3710146040Stjr    {
3711146040Stjr      bin_tree_t *mbc_tree;
3712146040Stjr      /* Build a tree for complex bracket.  */
3713146040Stjr      br_token.type = COMPLEX_BRACKET;
3714146040Stjr      br_token.opr.mbcset = mbcset;
3715146040Stjr      dfa->has_mb_node = 1;
3716146040Stjr      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3717146040Stjr      if (BE (mbc_tree == NULL, 0))
3718146040Stjr	goto build_word_op_espace;
3719146040Stjr      /* Then join them by ALT node.  */
3720146040Stjr      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3721146040Stjr      if (BE (mbc_tree != NULL, 1))
3722146040Stjr	return tree;
3723146040Stjr    }
3724146040Stjr  else
3725146040Stjr    {
3726146040Stjr      free_charset (mbcset);
3727146040Stjr      return tree;
3728146040Stjr    }
3729146040Stjr#else /* not RE_ENABLE_I18N */
3730146040Stjr  return tree;
3731146040Stjr#endif /* not RE_ENABLE_I18N */
3732146040Stjr
3733146040Stjr build_word_op_espace:
3734146040Stjr  re_free (sbcset);
3735146040Stjr#ifdef RE_ENABLE_I18N
3736146040Stjr  free_charset (mbcset);
3737146040Stjr#endif /* RE_ENABLE_I18N */
3738146040Stjr  *err = REG_ESPACE;
3739146040Stjr  return NULL;
3740146040Stjr}
3741146040Stjr
3742146040Stjr/* This is intended for the expressions like "a{1,3}".
3743146040Stjr   Fetch a number from `input', and return the number.
3744146040Stjr   Return -1, if the number field is empty like "{,1}".
3745146040Stjr   Return -2, If an error is occured.  */
3746146040Stjr
3747146040Stjrstatic int
3748146040Stjrfetch_number (input, token, syntax)
3749146040Stjr     re_string_t *input;
3750146040Stjr     re_token_t *token;
3751146040Stjr     reg_syntax_t syntax;
3752146040Stjr{
3753146040Stjr  int num = -1;
3754146040Stjr  unsigned char c;
3755146040Stjr  while (1)
3756146040Stjr    {
3757146040Stjr      fetch_token (token, input, syntax);
3758146040Stjr      c = token->opr.c;
3759146040Stjr      if (BE (token->type == END_OF_RE, 0))
3760146040Stjr	return -2;
3761146040Stjr      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3762146040Stjr	break;
3763146040Stjr      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3764146040Stjr	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3765146040Stjr      num = (num > RE_DUP_MAX) ? -2 : num;
3766146040Stjr    }
3767146040Stjr  return num;
3768146040Stjr}
3769146040Stjr
3770146040Stjr#ifdef RE_ENABLE_I18N
3771146040Stjrstatic void
3772146040Stjrfree_charset (re_charset_t *cset)
3773146040Stjr{
3774146040Stjr  re_free (cset->mbchars);
3775146040Stjr# ifdef _LIBC
3776146040Stjr  re_free (cset->coll_syms);
3777146040Stjr  re_free (cset->equiv_classes);
3778146040Stjr  re_free (cset->range_starts);
3779146040Stjr  re_free (cset->range_ends);
3780146040Stjr# endif
3781146040Stjr  re_free (cset->char_classes);
3782146040Stjr  re_free (cset);
3783146040Stjr}
3784146040Stjr#endif /* RE_ENABLE_I18N */
3785146040Stjr
3786146040Stjr/* Functions for binary tree operation.  */
3787146040Stjr
3788146040Stjr/* Create a tree node.  */
3789146040Stjr
3790146040Stjrstatic bin_tree_t *
3791146040Stjrcreate_tree (dfa, left, right, type)
3792146040Stjr     re_dfa_t *dfa;
3793146040Stjr     bin_tree_t *left;
3794146040Stjr     bin_tree_t *right;
3795146040Stjr     re_token_type_t type;
3796146040Stjr{
3797146040Stjr  re_token_t t;
3798146040Stjr  t.type = type;
3799146040Stjr  return create_token_tree (dfa, left, right, &t);
3800146040Stjr}
3801146040Stjr
3802146040Stjrstatic bin_tree_t *
3803146040Stjrcreate_token_tree (dfa, left, right, token)
3804146040Stjr     re_dfa_t *dfa;
3805146040Stjr     bin_tree_t *left;
3806146040Stjr     bin_tree_t *right;
3807146040Stjr     const re_token_t *token;
3808146040Stjr{
3809146040Stjr  bin_tree_t *tree;
3810146040Stjr  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3811146040Stjr    {
3812146040Stjr      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3813146040Stjr
3814146040Stjr      if (storage == NULL)
3815146040Stjr	return NULL;
3816146040Stjr      storage->next = dfa->str_tree_storage;
3817146040Stjr      dfa->str_tree_storage = storage;
3818146040Stjr      dfa->str_tree_storage_idx = 0;
3819146040Stjr    }
3820146040Stjr  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3821146040Stjr
3822146040Stjr  tree->parent = NULL;
3823146040Stjr  tree->left = left;
3824146040Stjr  tree->right = right;
3825146040Stjr  tree->token = *token;
3826146040Stjr  tree->token.duplicated = 0;
3827146040Stjr  tree->token.opt_subexp = 0;
3828146040Stjr  tree->first = NULL;
3829146040Stjr  tree->next = NULL;
3830146040Stjr  tree->node_idx = -1;
3831146040Stjr
3832146040Stjr  if (left != NULL)
3833146040Stjr    left->parent = tree;
3834146040Stjr  if (right != NULL)
3835146040Stjr    right->parent = tree;
3836146040Stjr  return tree;
3837146040Stjr}
3838146040Stjr
3839146040Stjr/* Mark the tree SRC as an optional subexpression.
3840146040Stjr   To be called from preorder or postorder.  */
3841146040Stjr
3842146040Stjrstatic reg_errcode_t
3843146040Stjrmark_opt_subexp (extra, node)
3844146040Stjr     void *extra;
3845146040Stjr     bin_tree_t *node;
3846146040Stjr{
3847146040Stjr  int idx = (int) (long) extra;
3848146040Stjr  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849146040Stjr    node->token.opt_subexp = 1;
3850146040Stjr
3851146040Stjr  return REG_NOERROR;
3852146040Stjr}
3853146040Stjr
3854146040Stjr/* Free the allocated memory inside NODE. */
3855146040Stjr
3856146040Stjrstatic void
3857146040Stjrfree_token (re_token_t *node)
3858146040Stjr{
3859146040Stjr#ifdef RE_ENABLE_I18N
3860146040Stjr  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861146040Stjr    free_charset (node->opr.mbcset);
3862146040Stjr  else
3863146040Stjr#endif /* RE_ENABLE_I18N */
3864146040Stjr    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865146040Stjr      re_free (node->opr.sbcset);
3866146040Stjr}
3867146040Stjr
3868146040Stjr/* Worker function for tree walking.  Free the allocated memory inside NODE
3869146040Stjr   and its children. */
3870146040Stjr
3871146040Stjrstatic reg_errcode_t
3872146040Stjrfree_tree (void *extra, bin_tree_t *node)
3873146040Stjr{
3874146040Stjr  free_token (&node->token);
3875146040Stjr  return REG_NOERROR;
3876146040Stjr}
3877146040Stjr
3878146040Stjr
3879146040Stjr/* Duplicate the node SRC, and return new node.  This is a preorder
3880146040Stjr   visit similar to the one implemented by the generic visitor, but
3881146040Stjr   we need more infrastructure to maintain two parallel trees --- so,
3882146040Stjr   it's easier to duplicate.  */
3883146040Stjr
3884146040Stjrstatic bin_tree_t *
3885146040Stjrduplicate_tree (root, dfa)
3886146040Stjr     const bin_tree_t *root;
3887146040Stjr     re_dfa_t *dfa;
3888146040Stjr{
3889146040Stjr  const bin_tree_t *node;
3890146040Stjr  bin_tree_t *dup_root;
3891146040Stjr  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3892146040Stjr
3893146040Stjr  for (node = root; ; )
3894146040Stjr    {
3895146040Stjr      /* Create a new tree and link it back to the current parent.  */
3896146040Stjr      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3897146040Stjr      if (*p_new == NULL)
3898146040Stjr	return NULL;
3899146040Stjr      (*p_new)->parent = dup_node;
3900146040Stjr      (*p_new)->token.duplicated = 1;
3901146040Stjr      dup_node = *p_new;
3902146040Stjr
3903146040Stjr      /* Go to the left node, or up and to the right.  */
3904146040Stjr      if (node->left)
3905146040Stjr	{
3906146040Stjr	  node = node->left;
3907146040Stjr	  p_new = &dup_node->left;
3908146040Stjr	}
3909146040Stjr      else
3910146040Stjr	{
3911146040Stjr	  const bin_tree_t *prev = NULL;
3912146040Stjr	  while (node->right == prev || node->right == NULL)
3913146040Stjr	    {
3914146040Stjr	      prev = node;
3915146040Stjr	      node = node->parent;
3916146040Stjr	      dup_node = dup_node->parent;
3917146040Stjr	      if (!node)
3918146040Stjr	        return dup_root;
3919146040Stjr	    }
3920146040Stjr	  node = node->right;
3921146040Stjr	  p_new = &dup_node->right;
3922146040Stjr	}
3923146040Stjr    }
3924146040Stjr}
3925