1146040Stjr/* Extended regular expression matching and search library. 2146040Stjr Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. 3146040Stjr This file is part of the GNU C Library. 4146040Stjr Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5146040Stjr 6146040Stjr The GNU C Library is free software; you can redistribute it and/or 7146040Stjr modify it under the terms of the GNU Lesser General Public 8146040Stjr License as published by the Free Software Foundation; either 9146040Stjr version 2.1 of the License, or (at your option) any later version. 10146040Stjr 11146040Stjr The GNU C Library is distributed in the hope that it will be useful, 12146040Stjr but WITHOUT ANY WARRANTY; without even the implied warranty of 13146040Stjr MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14146040Stjr Lesser General Public License for more details. 15146040Stjr 16146040Stjr You should have received a copy of the GNU Lesser General Public 17146040Stjr License along with the GNU C Library; if not, write to the Free 18146040Stjr Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19146040Stjr 02111-1307 USA. */ 20146040Stjr 21146040Stjrstatic reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 22146040Stjr int length, reg_syntax_t syntax); 23146040Stjrstatic void re_compile_fastmap_iter (regex_t *bufp, 24146040Stjr const re_dfastate_t *init_state, 25146040Stjr char *fastmap); 26146040Stjrstatic reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len); 27146040Stjrstatic void init_word_char (re_dfa_t *dfa); 28146040Stjr#ifdef RE_ENABLE_I18N 29146040Stjrstatic void free_charset (re_charset_t *cset); 30146040Stjr#endif /* RE_ENABLE_I18N */ 31146040Stjrstatic void free_workarea_compile (regex_t *preg); 32146040Stjrstatic reg_errcode_t create_initial_state (re_dfa_t *dfa); 33146040Stjr#ifdef RE_ENABLE_I18N 34146040Stjrstatic void optimize_utf8 (re_dfa_t *dfa); 35146040Stjr#endif 36146040Stjrstatic reg_errcode_t analyze (regex_t *preg); 37146040Stjrstatic reg_errcode_t create_initial_state (re_dfa_t *dfa); 38146040Stjrstatic reg_errcode_t preorder (bin_tree_t *root, 39146040Stjr reg_errcode_t (fn (void *, bin_tree_t *)), 40146040Stjr void *extra); 41146040Stjrstatic reg_errcode_t postorder (bin_tree_t *root, 42146040Stjr reg_errcode_t (fn (void *, bin_tree_t *)), 43146040Stjr void *extra); 44146040Stjrstatic reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 45146040Stjrstatic reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 46146040Stjrstatic bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 47146040Stjr bin_tree_t *node); 48146040Stjrstatic reg_errcode_t calc_first (void *extra, bin_tree_t *node); 49146040Stjrstatic reg_errcode_t calc_next (void *extra, bin_tree_t *node); 50146040Stjrstatic reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 51146040Stjrstatic reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node, 52146040Stjr int top_clone_node, int root_node, 53146040Stjr unsigned int constraint); 54146040Stjrstatic reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx, 55146040Stjr unsigned int constraint); 56146040Stjrstatic int search_duplicated_node (re_dfa_t *dfa, int org_node, 57146040Stjr unsigned int constraint); 58146040Stjrstatic reg_errcode_t calc_eclosure (re_dfa_t *dfa); 59146040Stjrstatic reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 60146040Stjr int node, int root); 61146040Stjrstatic reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 62146040Stjrstatic int fetch_number (re_string_t *input, re_token_t *token, 63146040Stjr reg_syntax_t syntax); 64146040Stjrstatic void fetch_token (re_token_t *result, re_string_t *input, 65146040Stjr reg_syntax_t syntax); 66146040Stjrstatic int peek_token (re_token_t *token, re_string_t *input, 67146040Stjr reg_syntax_t syntax); 68146040Stjrstatic int peek_token_bracket (re_token_t *token, re_string_t *input, 69146040Stjr reg_syntax_t syntax); 70146040Stjrstatic bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 71146040Stjr reg_syntax_t syntax, reg_errcode_t *err); 72146040Stjrstatic bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 73146040Stjr re_token_t *token, reg_syntax_t syntax, 74146040Stjr int nest, reg_errcode_t *err); 75146040Stjrstatic bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 76146040Stjr re_token_t *token, reg_syntax_t syntax, 77146040Stjr int nest, reg_errcode_t *err); 78146040Stjrstatic bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 79146040Stjr re_token_t *token, reg_syntax_t syntax, 80146040Stjr int nest, reg_errcode_t *err); 81146040Stjrstatic bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 82146040Stjr re_token_t *token, reg_syntax_t syntax, 83146040Stjr int nest, reg_errcode_t *err); 84146040Stjrstatic bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 85146040Stjr re_dfa_t *dfa, re_token_t *token, 86146040Stjr reg_syntax_t syntax, reg_errcode_t *err); 87146040Stjrstatic bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 88146040Stjr re_token_t *token, reg_syntax_t syntax, 89146040Stjr reg_errcode_t *err); 90146040Stjrstatic reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 91146040Stjr re_string_t *regexp, 92146040Stjr re_token_t *token, int token_len, 93146040Stjr re_dfa_t *dfa, 94146040Stjr reg_syntax_t syntax, 95146040Stjr int accept_hyphen); 96146040Stjrstatic reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 97146040Stjr re_string_t *regexp, 98146040Stjr re_token_t *token); 99146040Stjr#ifndef _LIBC 100146040Stjr# ifdef RE_ENABLE_I18N 101146040Stjrstatic reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, 102146040Stjr re_charset_t *mbcset, int *range_alloc, 103146040Stjr bracket_elem_t *start_elem, 104146040Stjr bracket_elem_t *end_elem); 105146040Stjrstatic reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, 106146040Stjr re_charset_t *mbcset, 107146040Stjr int *coll_sym_alloc, 108146040Stjr const unsigned char *name); 109146040Stjr# else /* not RE_ENABLE_I18N */ 110146040Stjrstatic reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, 111146040Stjr bracket_elem_t *start_elem, 112146040Stjr bracket_elem_t *end_elem); 113146040Stjrstatic reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, 114146040Stjr const unsigned char *name); 115146040Stjr# endif /* not RE_ENABLE_I18N */ 116146040Stjr#endif /* not _LIBC */ 117146040Stjr#ifdef RE_ENABLE_I18N 118146040Stjrstatic reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, 119146040Stjr re_charset_t *mbcset, 120146040Stjr int *equiv_class_alloc, 121146040Stjr const unsigned char *name); 122146040Stjrstatic reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans, 123146040Stjr re_bitset_ptr_t sbcset, 124146040Stjr re_charset_t *mbcset, 125146040Stjr int *char_class_alloc, 126146040Stjr const unsigned char *class_name, 127146040Stjr reg_syntax_t syntax); 128146040Stjr#else /* not RE_ENABLE_I18N */ 129146040Stjrstatic reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, 130146040Stjr const unsigned char *name); 131146040Stjrstatic reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans, 132146040Stjr re_bitset_ptr_t sbcset, 133146040Stjr const unsigned char *class_name, 134146040Stjr reg_syntax_t syntax); 135146040Stjr#endif /* not RE_ENABLE_I18N */ 136146040Stjrstatic bin_tree_t *build_charclass_op (re_dfa_t *dfa, 137146040Stjr unsigned RE_TRANSLATE_TYPE trans, 138146040Stjr const unsigned char *class_name, 139146040Stjr const unsigned char *extra, 140146040Stjr int non_match, reg_errcode_t *err); 141146040Stjrstatic bin_tree_t *create_tree (re_dfa_t *dfa, 142146040Stjr bin_tree_t *left, bin_tree_t *right, 143146040Stjr re_token_type_t type); 144146040Stjrstatic bin_tree_t *create_token_tree (re_dfa_t *dfa, 145146040Stjr bin_tree_t *left, bin_tree_t *right, 146146040Stjr const re_token_t *token); 147146040Stjrstatic bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 148146040Stjrstatic void free_token (re_token_t *node); 149146040Stjrstatic reg_errcode_t free_tree (void *extra, bin_tree_t *node); 150146040Stjrstatic reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 151146040Stjr 152146040Stjr/* This table gives an error message for each of the error codes listed 153146040Stjr in regex.h. Obviously the order here has to be same as there. 154146040Stjr POSIX doesn't require that we do anything for REG_NOERROR, 155146040Stjr but why not be nice? */ 156146040Stjr 157146040Stjrconst char __re_error_msgid[] attribute_hidden = 158146040Stjr { 159146040Stjr#define REG_NOERROR_IDX 0 160146040Stjr gettext_noop ("Success") /* REG_NOERROR */ 161146040Stjr "\0" 162146040Stjr#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 163146040Stjr gettext_noop ("No match") /* REG_NOMATCH */ 164146040Stjr "\0" 165146040Stjr#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 166146040Stjr gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 167146040Stjr "\0" 168146040Stjr#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 169146040Stjr gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 170146040Stjr "\0" 171146040Stjr#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 172146040Stjr gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 173146040Stjr "\0" 174146040Stjr#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 175146040Stjr gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 176146040Stjr "\0" 177146040Stjr#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 178146040Stjr gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 179146040Stjr "\0" 180146040Stjr#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 181146040Stjr gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 182146040Stjr "\0" 183146040Stjr#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 184146040Stjr gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 185146040Stjr "\0" 186146040Stjr#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 187146040Stjr gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 188146040Stjr "\0" 189146040Stjr#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 190146040Stjr gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 191146040Stjr "\0" 192146040Stjr#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 193146040Stjr gettext_noop ("Invalid range end") /* REG_ERANGE */ 194146040Stjr "\0" 195146040Stjr#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 196146040Stjr gettext_noop ("Memory exhausted") /* REG_ESPACE */ 197146040Stjr "\0" 198146040Stjr#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 199146040Stjr gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 200146040Stjr "\0" 201146040Stjr#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 202146040Stjr gettext_noop ("Premature end of regular expression") /* REG_EEND */ 203146040Stjr "\0" 204146040Stjr#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 205146040Stjr gettext_noop ("Regular expression too big") /* REG_ESIZE */ 206146040Stjr "\0" 207146040Stjr#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 208146040Stjr gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 209146040Stjr }; 210146040Stjr 211146040Stjrconst size_t __re_error_msgid_idx[] attribute_hidden = 212146040Stjr { 213146040Stjr REG_NOERROR_IDX, 214146040Stjr REG_NOMATCH_IDX, 215146040Stjr REG_BADPAT_IDX, 216146040Stjr REG_ECOLLATE_IDX, 217146040Stjr REG_ECTYPE_IDX, 218146040Stjr REG_EESCAPE_IDX, 219146040Stjr REG_ESUBREG_IDX, 220146040Stjr REG_EBRACK_IDX, 221146040Stjr REG_EPAREN_IDX, 222146040Stjr REG_EBRACE_IDX, 223146040Stjr REG_BADBR_IDX, 224146040Stjr REG_ERANGE_IDX, 225146040Stjr REG_ESPACE_IDX, 226146040Stjr REG_BADRPT_IDX, 227146040Stjr REG_EEND_IDX, 228146040Stjr REG_ESIZE_IDX, 229146040Stjr REG_ERPAREN_IDX 230146040Stjr }; 231146040Stjr 232146040Stjr/* Entry points for GNU code. */ 233146040Stjr 234146040Stjr/* re_compile_pattern is the GNU regular expression compiler: it 235146040Stjr compiles PATTERN (of length LENGTH) and puts the result in BUFP. 236146040Stjr Returns 0 if the pattern was valid, otherwise an error string. 237146040Stjr 238146040Stjr Assumes the `allocated' (and perhaps `buffer') and `translate' fields 239146040Stjr are set in BUFP on entry. */ 240146040Stjr 241146040Stjrconst char * 242146040Stjrre_compile_pattern (pattern, length, bufp) 243146040Stjr const char *pattern; 244146040Stjr size_t length; 245146040Stjr struct re_pattern_buffer *bufp; 246146040Stjr{ 247146040Stjr reg_errcode_t ret; 248146040Stjr 249146040Stjr /* And GNU code determines whether or not to get register information 250146040Stjr by passing null for the REGS argument to re_match, etc., not by 251146040Stjr setting no_sub, unless RE_NO_SUB is set. */ 252146040Stjr bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 253146040Stjr 254146040Stjr /* Match anchors at newline. */ 255146040Stjr bufp->newline_anchor = 1; 256146040Stjr 257146040Stjr ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 258146040Stjr 259146040Stjr if (!ret) 260146040Stjr return NULL; 261146040Stjr return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 262146040Stjr} 263146040Stjr#ifdef _LIBC 264146040Stjrweak_alias (__re_compile_pattern, re_compile_pattern) 265146040Stjr#endif 266146040Stjr 267146040Stjr/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 268146040Stjr also be assigned to arbitrarily: each pattern buffer stores its own 269146040Stjr syntax, so it can be changed between regex compilations. */ 270146040Stjr/* This has no initializer because initialized variables in Emacs 271146040Stjr become read-only after dumping. */ 272146040Stjrreg_syntax_t re_syntax_options; 273146040Stjr 274146040Stjr 275146040Stjr/* Specify the precise syntax of regexps for compilation. This provides 276146040Stjr for compatibility for various utilities which historically have 277146040Stjr different, incompatible syntaxes. 278146040Stjr 279146040Stjr The argument SYNTAX is a bit mask comprised of the various bits 280146040Stjr defined in regex.h. We return the old syntax. */ 281146040Stjr 282146040Stjrreg_syntax_t 283146040Stjrre_set_syntax (syntax) 284146040Stjr reg_syntax_t syntax; 285146040Stjr{ 286146040Stjr reg_syntax_t ret = re_syntax_options; 287146040Stjr 288146040Stjr re_syntax_options = syntax; 289146040Stjr return ret; 290146040Stjr} 291146040Stjr#ifdef _LIBC 292146040Stjrweak_alias (__re_set_syntax, re_set_syntax) 293146040Stjr#endif 294146040Stjr 295146040Stjrint 296146040Stjrre_compile_fastmap (bufp) 297146040Stjr struct re_pattern_buffer *bufp; 298146040Stjr{ 299146040Stjr re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 300146040Stjr char *fastmap = bufp->fastmap; 301146040Stjr 302146040Stjr memset (fastmap, '\0', sizeof (char) * SBC_MAX); 303146040Stjr re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 304146040Stjr if (dfa->init_state != dfa->init_state_word) 305146040Stjr re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 306146040Stjr if (dfa->init_state != dfa->init_state_nl) 307146040Stjr re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 308146040Stjr if (dfa->init_state != dfa->init_state_begbuf) 309146040Stjr re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 310146040Stjr bufp->fastmap_accurate = 1; 311146040Stjr return 0; 312146040Stjr} 313146040Stjr#ifdef _LIBC 314146040Stjrweak_alias (__re_compile_fastmap, re_compile_fastmap) 315146040Stjr#endif 316146040Stjr 317146040Stjrstatic inline void 318146040Stjr__attribute ((always_inline)) 319146040Stjrre_set_fastmap (char *fastmap, int icase, int ch) 320146040Stjr{ 321146040Stjr fastmap[ch] = 1; 322146040Stjr if (icase) 323146040Stjr fastmap[tolower (ch)] = 1; 324146040Stjr} 325146040Stjr 326146040Stjr/* Helper function for re_compile_fastmap. 327146040Stjr Compile fastmap for the initial_state INIT_STATE. */ 328146040Stjr 329146040Stjrstatic void 330146040Stjrre_compile_fastmap_iter (bufp, init_state, fastmap) 331146040Stjr regex_t *bufp; 332146040Stjr const re_dfastate_t *init_state; 333146040Stjr char *fastmap; 334146040Stjr{ 335146040Stjr re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 336146040Stjr int node_cnt; 337146040Stjr int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 338146040Stjr for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 339146040Stjr { 340146040Stjr int node = init_state->nodes.elems[node_cnt]; 341146040Stjr re_token_type_t type = dfa->nodes[node].type; 342146040Stjr 343146040Stjr if (type == CHARACTER) 344146040Stjr { 345146040Stjr re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 346146040Stjr#ifdef RE_ENABLE_I18N 347146040Stjr if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 348146040Stjr { 349146040Stjr unsigned char *buf = alloca (dfa->mb_cur_max), *p; 350146040Stjr wchar_t wc; 351146040Stjr mbstate_t state; 352146040Stjr 353146040Stjr p = buf; 354146040Stjr *p++ = dfa->nodes[node].opr.c; 355146040Stjr while (++node < dfa->nodes_len 356146040Stjr && dfa->nodes[node].type == CHARACTER 357146040Stjr && dfa->nodes[node].mb_partial) 358146040Stjr *p++ = dfa->nodes[node].opr.c; 359146040Stjr memset (&state, 0, sizeof (state)); 360146040Stjr if (mbrtowc (&wc, (const char *) buf, p - buf, 361146040Stjr &state) == p - buf 362146040Stjr && (__wcrtomb ((char *) buf, towlower (wc), &state) 363146040Stjr != (size_t) -1)) 364146040Stjr re_set_fastmap (fastmap, 0, buf[0]); 365146040Stjr } 366146040Stjr#endif 367146040Stjr } 368146040Stjr else if (type == SIMPLE_BRACKET) 369146040Stjr { 370146040Stjr int i, j, ch; 371146040Stjr for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 372146040Stjr for (j = 0; j < UINT_BITS; ++j, ++ch) 373146040Stjr if (dfa->nodes[node].opr.sbcset[i] & (1 << j)) 374146040Stjr re_set_fastmap (fastmap, icase, ch); 375146040Stjr } 376146040Stjr#ifdef RE_ENABLE_I18N 377146040Stjr else if (type == COMPLEX_BRACKET) 378146040Stjr { 379146040Stjr int i; 380146040Stjr re_charset_t *cset = dfa->nodes[node].opr.mbcset; 381146040Stjr if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes 382146040Stjr || cset->nranges || cset->nchar_classes) 383146040Stjr { 384146040Stjr# ifdef _LIBC 385146040Stjr if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) 386146040Stjr { 387146040Stjr /* In this case we want to catch the bytes which are 388146040Stjr the first byte of any collation elements. 389146040Stjr e.g. In da_DK, we want to catch 'a' since "aa" 390146040Stjr is a valid collation element, and don't catch 391146040Stjr 'b' since 'b' is the only collation element 392146040Stjr which starts from 'b'. */ 393146040Stjr int j, ch; 394146040Stjr const int32_t *table = (const int32_t *) 395146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 396146040Stjr for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 397146040Stjr for (j = 0; j < UINT_BITS; ++j, ++ch) 398146040Stjr if (table[ch] < 0) 399146040Stjr re_set_fastmap (fastmap, icase, ch); 400146040Stjr } 401146040Stjr# else 402146040Stjr if (dfa->mb_cur_max > 1) 403146040Stjr for (i = 0; i < SBC_MAX; ++i) 404146040Stjr if (__btowc (i) == WEOF) 405146040Stjr re_set_fastmap (fastmap, icase, i); 406146040Stjr# endif /* not _LIBC */ 407146040Stjr } 408146040Stjr for (i = 0; i < cset->nmbchars; ++i) 409146040Stjr { 410146040Stjr char buf[256]; 411146040Stjr mbstate_t state; 412146040Stjr memset (&state, '\0', sizeof (state)); 413146040Stjr if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 414146040Stjr re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 415146040Stjr if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 416146040Stjr { 417146040Stjr if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 418146040Stjr != (size_t) -1) 419146040Stjr re_set_fastmap (fastmap, 0, *(unsigned char *) buf); 420146040Stjr } 421146040Stjr } 422146040Stjr } 423146040Stjr#endif /* RE_ENABLE_I18N */ 424146040Stjr else if (type == OP_PERIOD 425146040Stjr#ifdef RE_ENABLE_I18N 426146040Stjr || type == OP_UTF8_PERIOD 427146040Stjr#endif /* RE_ENABLE_I18N */ 428146040Stjr || type == END_OF_RE) 429146040Stjr { 430146040Stjr memset (fastmap, '\1', sizeof (char) * SBC_MAX); 431146040Stjr if (type == END_OF_RE) 432146040Stjr bufp->can_be_null = 1; 433146040Stjr return; 434146040Stjr } 435146040Stjr } 436146040Stjr} 437146040Stjr 438146040Stjr/* Entry point for POSIX code. */ 439146040Stjr/* regcomp takes a regular expression as a string and compiles it. 440146040Stjr 441146040Stjr PREG is a regex_t *. We do not expect any fields to be initialized, 442146040Stjr since POSIX says we shouldn't. Thus, we set 443146040Stjr 444146040Stjr `buffer' to the compiled pattern; 445146040Stjr `used' to the length of the compiled pattern; 446146040Stjr `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 447146040Stjr REG_EXTENDED bit in CFLAGS is set; otherwise, to 448146040Stjr RE_SYNTAX_POSIX_BASIC; 449146040Stjr `newline_anchor' to REG_NEWLINE being set in CFLAGS; 450146040Stjr `fastmap' to an allocated space for the fastmap; 451146040Stjr `fastmap_accurate' to zero; 452146040Stjr `re_nsub' to the number of subexpressions in PATTERN. 453146040Stjr 454146040Stjr PATTERN is the address of the pattern string. 455146040Stjr 456146040Stjr CFLAGS is a series of bits which affect compilation. 457146040Stjr 458146040Stjr If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 459146040Stjr use POSIX basic syntax. 460146040Stjr 461146040Stjr If REG_NEWLINE is set, then . and [^...] don't match newline. 462146040Stjr Also, regexec will try a match beginning after every newline. 463146040Stjr 464146040Stjr If REG_ICASE is set, then we considers upper- and lowercase 465146040Stjr versions of letters to be equivalent when matching. 466146040Stjr 467146040Stjr If REG_NOSUB is set, then when PREG is passed to regexec, that 468146040Stjr routine will report only success or failure, and nothing about the 469146040Stjr registers. 470146040Stjr 471146040Stjr It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 472146040Stjr the return codes and their meanings.) */ 473146040Stjr 474146040Stjrint 475146040Stjrregcomp (preg, pattern, cflags) 476146040Stjr regex_t *__restrict preg; 477146040Stjr const char *__restrict pattern; 478146040Stjr int cflags; 479146040Stjr{ 480146040Stjr reg_errcode_t ret; 481146040Stjr reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 482146040Stjr : RE_SYNTAX_POSIX_BASIC); 483146040Stjr 484146040Stjr preg->buffer = NULL; 485146040Stjr preg->allocated = 0; 486146040Stjr preg->used = 0; 487146040Stjr 488146040Stjr /* Try to allocate space for the fastmap. */ 489146040Stjr preg->fastmap = re_malloc (char, SBC_MAX); 490146040Stjr if (BE (preg->fastmap == NULL, 0)) 491146040Stjr return REG_ESPACE; 492146040Stjr 493146040Stjr syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 494146040Stjr 495146040Stjr /* If REG_NEWLINE is set, newlines are treated differently. */ 496146040Stjr if (cflags & REG_NEWLINE) 497146040Stjr { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 498146040Stjr syntax &= ~RE_DOT_NEWLINE; 499146040Stjr syntax |= RE_HAT_LISTS_NOT_NEWLINE; 500146040Stjr /* It also changes the matching behavior. */ 501146040Stjr preg->newline_anchor = 1; 502146040Stjr } 503146040Stjr else 504146040Stjr preg->newline_anchor = 0; 505146040Stjr preg->no_sub = !!(cflags & REG_NOSUB); 506146040Stjr preg->translate = NULL; 507146040Stjr 508146040Stjr ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 509146040Stjr 510146040Stjr /* POSIX doesn't distinguish between an unmatched open-group and an 511146040Stjr unmatched close-group: both are REG_EPAREN. */ 512146040Stjr if (ret == REG_ERPAREN) 513146040Stjr ret = REG_EPAREN; 514146040Stjr 515146040Stjr /* We have already checked preg->fastmap != NULL. */ 516146040Stjr if (BE (ret == REG_NOERROR, 1)) 517146040Stjr /* Compute the fastmap now, since regexec cannot modify the pattern 518146040Stjr buffer. This function never fails in this implementation. */ 519146040Stjr (void) re_compile_fastmap (preg); 520146040Stjr else 521146040Stjr { 522146040Stjr /* Some error occurred while compiling the expression. */ 523146040Stjr re_free (preg->fastmap); 524146040Stjr preg->fastmap = NULL; 525146040Stjr } 526146040Stjr 527146040Stjr return (int) ret; 528146040Stjr} 529146040Stjr#ifdef _LIBC 530146040Stjrweak_alias (__regcomp, regcomp) 531146040Stjr#endif 532146040Stjr 533146040Stjr/* Returns a message corresponding to an error code, ERRCODE, returned 534146040Stjr from either regcomp or regexec. We don't use PREG here. */ 535146040Stjr 536146040Stjrsize_t 537146040Stjrregerror (errcode, preg, errbuf, errbuf_size) 538146040Stjr int errcode; 539146040Stjr const regex_t *preg; 540146040Stjr char *errbuf; 541146040Stjr size_t errbuf_size; 542146040Stjr{ 543146040Stjr const char *msg; 544146040Stjr size_t msg_size; 545146040Stjr 546146040Stjr if (BE (errcode < 0 547146040Stjr || errcode >= (int) (sizeof (__re_error_msgid_idx) 548146040Stjr / sizeof (__re_error_msgid_idx[0])), 0)) 549146040Stjr /* Only error codes returned by the rest of the code should be passed 550146040Stjr to this routine. If we are given anything else, or if other regex 551146040Stjr code generates an invalid error code, then the program has a bug. 552146040Stjr Dump core so we can fix it. */ 553146040Stjr abort (); 554146040Stjr 555146040Stjr msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 556146040Stjr 557146040Stjr msg_size = strlen (msg) + 1; /* Includes the null. */ 558146040Stjr 559146040Stjr if (BE (errbuf_size != 0, 1)) 560146040Stjr { 561146040Stjr if (BE (msg_size > errbuf_size, 0)) 562146040Stjr { 563146040Stjr#if defined HAVE_MEMPCPY || defined _LIBC 564146040Stjr *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 565146040Stjr#else 566146040Stjr memcpy (errbuf, msg, errbuf_size - 1); 567146040Stjr errbuf[errbuf_size - 1] = 0; 568146040Stjr#endif 569146040Stjr } 570146040Stjr else 571146040Stjr memcpy (errbuf, msg, msg_size); 572146040Stjr } 573146040Stjr 574146040Stjr return msg_size; 575146040Stjr} 576146040Stjr#ifdef _LIBC 577146040Stjrweak_alias (__regerror, regerror) 578146040Stjr#endif 579146040Stjr 580146040Stjr 581146040Stjr#ifdef RE_ENABLE_I18N 582146040Stjr/* This static array is used for the map to single-byte characters when 583146040Stjr UTF-8 is used. Otherwise we would allocate memory just to initialize 584146040Stjr it the same all the time. UTF-8 is the preferred encoding so this is 585146040Stjr a worthwhile optimization. */ 586146040Stjrstatic const bitset utf8_sb_map = 587146040Stjr{ 588146040Stjr /* Set the first 128 bits. */ 589146040Stjr# if UINT_MAX == 0xffffffff 590146040Stjr 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff 591146040Stjr# else 592146040Stjr# error "Add case for new unsigned int size" 593146040Stjr# endif 594146040Stjr}; 595146040Stjr#endif 596146040Stjr 597146040Stjr 598146040Stjrstatic void 599146040Stjrfree_dfa_content (re_dfa_t *dfa) 600146040Stjr{ 601146040Stjr int i, j; 602146040Stjr 603146040Stjr if (dfa->nodes) 604146040Stjr for (i = 0; i < dfa->nodes_len; ++i) 605146040Stjr free_token (dfa->nodes + i); 606146040Stjr re_free (dfa->nexts); 607146040Stjr for (i = 0; i < dfa->nodes_len; ++i) 608146040Stjr { 609146040Stjr if (dfa->eclosures != NULL) 610146040Stjr re_node_set_free (dfa->eclosures + i); 611146040Stjr if (dfa->inveclosures != NULL) 612146040Stjr re_node_set_free (dfa->inveclosures + i); 613146040Stjr if (dfa->edests != NULL) 614146040Stjr re_node_set_free (dfa->edests + i); 615146040Stjr } 616146040Stjr re_free (dfa->edests); 617146040Stjr re_free (dfa->eclosures); 618146040Stjr re_free (dfa->inveclosures); 619146040Stjr re_free (dfa->nodes); 620146040Stjr 621146040Stjr if (dfa->state_table) 622146040Stjr for (i = 0; i <= dfa->state_hash_mask; ++i) 623146040Stjr { 624146040Stjr struct re_state_table_entry *entry = dfa->state_table + i; 625146040Stjr for (j = 0; j < entry->num; ++j) 626146040Stjr { 627146040Stjr re_dfastate_t *state = entry->array[j]; 628146040Stjr free_state (state); 629146040Stjr } 630146040Stjr re_free (entry->array); 631146040Stjr } 632146040Stjr re_free (dfa->state_table); 633146040Stjr#ifdef RE_ENABLE_I18N 634146040Stjr if (dfa->sb_char != utf8_sb_map) 635146040Stjr re_free (dfa->sb_char); 636146040Stjr#endif 637146040Stjr re_free (dfa->subexp_map); 638146040Stjr#ifdef DEBUG 639146040Stjr re_free (dfa->re_str); 640146040Stjr#endif 641146040Stjr 642146040Stjr re_free (dfa); 643146040Stjr} 644146040Stjr 645146040Stjr 646146040Stjr/* Free dynamically allocated space used by PREG. */ 647146040Stjr 648146040Stjrvoid 649146040Stjrregfree (preg) 650146040Stjr regex_t *preg; 651146040Stjr{ 652146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 653146040Stjr if (BE (dfa != NULL, 1)) 654146040Stjr free_dfa_content (dfa); 655146040Stjr preg->buffer = NULL; 656146040Stjr preg->allocated = 0; 657146040Stjr 658146040Stjr re_free (preg->fastmap); 659146040Stjr preg->fastmap = NULL; 660146040Stjr 661146040Stjr re_free (preg->translate); 662146040Stjr preg->translate = NULL; 663146040Stjr} 664146040Stjr#ifdef _LIBC 665146040Stjrweak_alias (__regfree, regfree) 666146040Stjr#endif 667146040Stjr 668146040Stjr/* Entry points compatible with 4.2 BSD regex library. We don't define 669146040Stjr them unless specifically requested. */ 670146040Stjr 671146040Stjr#if defined _REGEX_RE_COMP || defined _LIBC 672146040Stjr 673146040Stjr/* BSD has one and only one pattern buffer. */ 674146040Stjrstatic struct re_pattern_buffer re_comp_buf; 675146040Stjr 676146040Stjrchar * 677146040Stjr# ifdef _LIBC 678146040Stjr/* Make these definitions weak in libc, so POSIX programs can redefine 679146040Stjr these names if they don't use our functions, and still use 680146040Stjr regcomp/regexec above without link errors. */ 681146040Stjrweak_function 682146040Stjr# endif 683146040Stjrre_comp (s) 684146040Stjr const char *s; 685146040Stjr{ 686146040Stjr reg_errcode_t ret; 687146040Stjr char *fastmap; 688146040Stjr 689146040Stjr if (!s) 690146040Stjr { 691146040Stjr if (!re_comp_buf.buffer) 692146040Stjr return gettext ("No previous regular expression"); 693146040Stjr return 0; 694146040Stjr } 695146040Stjr 696146040Stjr if (re_comp_buf.buffer) 697146040Stjr { 698146040Stjr fastmap = re_comp_buf.fastmap; 699146040Stjr re_comp_buf.fastmap = NULL; 700146040Stjr __regfree (&re_comp_buf); 701146040Stjr memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 702146040Stjr re_comp_buf.fastmap = fastmap; 703146040Stjr } 704146040Stjr 705146040Stjr if (re_comp_buf.fastmap == NULL) 706146040Stjr { 707146040Stjr re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 708146040Stjr if (re_comp_buf.fastmap == NULL) 709146040Stjr return (char *) gettext (__re_error_msgid 710146040Stjr + __re_error_msgid_idx[(int) REG_ESPACE]); 711146040Stjr } 712146040Stjr 713146040Stjr /* Since `re_exec' always passes NULL for the `regs' argument, we 714146040Stjr don't need to initialize the pattern buffer fields which affect it. */ 715146040Stjr 716146040Stjr /* Match anchors at newlines. */ 717146040Stjr re_comp_buf.newline_anchor = 1; 718146040Stjr 719146040Stjr ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 720146040Stjr 721146040Stjr if (!ret) 722146040Stjr return NULL; 723146040Stjr 724146040Stjr /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 725146040Stjr return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 726146040Stjr} 727146040Stjr 728146040Stjr#ifdef _LIBC 729146040Stjrlibc_freeres_fn (free_mem) 730146040Stjr{ 731146040Stjr __regfree (&re_comp_buf); 732146040Stjr} 733146040Stjr#endif 734146040Stjr 735146040Stjr#endif /* _REGEX_RE_COMP */ 736146040Stjr 737146040Stjr/* Internal entry point. 738146040Stjr Compile the regular expression PATTERN, whose length is LENGTH. 739146040Stjr SYNTAX indicate regular expression's syntax. */ 740146040Stjr 741146040Stjrstatic reg_errcode_t 742146040Stjrre_compile_internal (preg, pattern, length, syntax) 743146040Stjr regex_t *preg; 744146040Stjr const char * pattern; 745146040Stjr int length; 746146040Stjr reg_syntax_t syntax; 747146040Stjr{ 748146040Stjr reg_errcode_t err = REG_NOERROR; 749146040Stjr re_dfa_t *dfa; 750146040Stjr re_string_t regexp; 751146040Stjr 752146040Stjr /* Initialize the pattern buffer. */ 753146040Stjr preg->fastmap_accurate = 0; 754146040Stjr preg->syntax = syntax; 755146040Stjr preg->not_bol = preg->not_eol = 0; 756146040Stjr preg->used = 0; 757146040Stjr preg->re_nsub = 0; 758146040Stjr preg->can_be_null = 0; 759146040Stjr preg->regs_allocated = REGS_UNALLOCATED; 760146040Stjr 761146040Stjr /* Initialize the dfa. */ 762146040Stjr dfa = (re_dfa_t *) preg->buffer; 763146040Stjr if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 764146040Stjr { 765146040Stjr /* If zero allocated, but buffer is non-null, try to realloc 766146040Stjr enough space. This loses if buffer's address is bogus, but 767146040Stjr that is the user's responsibility. If ->buffer is NULL this 768146040Stjr is a simple allocation. */ 769146040Stjr dfa = re_realloc (preg->buffer, re_dfa_t, 1); 770146040Stjr if (dfa == NULL) 771146040Stjr return REG_ESPACE; 772146040Stjr preg->allocated = sizeof (re_dfa_t); 773146040Stjr preg->buffer = (unsigned char *) dfa; 774146040Stjr } 775146040Stjr preg->used = sizeof (re_dfa_t); 776146040Stjr 777146040Stjr err = init_dfa (dfa, length); 778146040Stjr if (BE (err != REG_NOERROR, 0)) 779146040Stjr { 780146040Stjr free_dfa_content (dfa); 781146040Stjr preg->buffer = NULL; 782146040Stjr preg->allocated = 0; 783146040Stjr return err; 784146040Stjr } 785146040Stjr#ifdef DEBUG 786146040Stjr dfa->re_str = re_malloc (char, length + 1); 787146040Stjr strncpy (dfa->re_str, pattern, length + 1); 788146040Stjr#endif 789146040Stjr 790146040Stjr err = re_string_construct (®exp, pattern, length, preg->translate, 791146040Stjr syntax & RE_ICASE, dfa); 792146040Stjr if (BE (err != REG_NOERROR, 0)) 793146040Stjr { 794146040Stjr re_compile_internal_free_return: 795146040Stjr free_workarea_compile (preg); 796146040Stjr re_string_destruct (®exp); 797146040Stjr free_dfa_content (dfa); 798146040Stjr preg->buffer = NULL; 799146040Stjr preg->allocated = 0; 800146040Stjr return err; 801146040Stjr } 802146040Stjr 803146040Stjr /* Parse the regular expression, and build a structure tree. */ 804146040Stjr preg->re_nsub = 0; 805146040Stjr dfa->str_tree = parse (®exp, preg, syntax, &err); 806146040Stjr if (BE (dfa->str_tree == NULL, 0)) 807146040Stjr goto re_compile_internal_free_return; 808146040Stjr 809146040Stjr /* Analyze the tree and create the nfa. */ 810146040Stjr err = analyze (preg); 811146040Stjr if (BE (err != REG_NOERROR, 0)) 812146040Stjr goto re_compile_internal_free_return; 813146040Stjr 814146040Stjr#ifdef RE_ENABLE_I18N 815146040Stjr /* If possible, do searching in single byte encoding to speed things up. */ 816146040Stjr if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 817146040Stjr optimize_utf8 (dfa); 818146040Stjr#endif 819146040Stjr 820146040Stjr /* Then create the initial state of the dfa. */ 821146040Stjr err = create_initial_state (dfa); 822146040Stjr 823146040Stjr /* Release work areas. */ 824146040Stjr free_workarea_compile (preg); 825146040Stjr re_string_destruct (®exp); 826146040Stjr 827146040Stjr if (BE (err != REG_NOERROR, 0)) 828146040Stjr { 829146040Stjr free_dfa_content (dfa); 830146040Stjr preg->buffer = NULL; 831146040Stjr preg->allocated = 0; 832146040Stjr } 833146040Stjr 834146040Stjr return err; 835146040Stjr} 836146040Stjr 837146040Stjr/* Initialize DFA. We use the length of the regular expression PAT_LEN 838146040Stjr as the initial length of some arrays. */ 839146040Stjr 840146040Stjrstatic reg_errcode_t 841146040Stjrinit_dfa (dfa, pat_len) 842146040Stjr re_dfa_t *dfa; 843146040Stjr int pat_len; 844146040Stjr{ 845146040Stjr int table_size; 846146040Stjr#ifndef _LIBC 847146040Stjr char *codeset_name; 848146040Stjr#endif 849146040Stjr 850146040Stjr memset (dfa, '\0', sizeof (re_dfa_t)); 851146040Stjr 852146040Stjr /* Force allocation of str_tree_storage the first time. */ 853146040Stjr dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 854146040Stjr 855146040Stjr dfa->nodes_alloc = pat_len + 1; 856146040Stjr dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 857146040Stjr 858146040Stjr dfa->states_alloc = pat_len + 1; 859146040Stjr 860146040Stjr /* table_size = 2 ^ ceil(log pat_len) */ 861146040Stjr for (table_size = 1; table_size > 0; table_size <<= 1) 862146040Stjr if (table_size > pat_len) 863146040Stjr break; 864146040Stjr 865146040Stjr dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 866146040Stjr dfa->state_hash_mask = table_size - 1; 867146040Stjr 868146040Stjr dfa->mb_cur_max = MB_CUR_MAX; 869146040Stjr#ifdef _LIBC 870146040Stjr if (dfa->mb_cur_max == 6 871146040Stjr && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 872146040Stjr dfa->is_utf8 = 1; 873146040Stjr dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 874146040Stjr != 0); 875146040Stjr#else 876146040Stjr# ifdef HAVE_LANGINFO_CODESET 877146040Stjr codeset_name = nl_langinfo (CODESET); 878146040Stjr# else 879146040Stjr codeset_name = getenv ("LC_ALL"); 880146040Stjr if (codeset_name == NULL || codeset_name[0] == '\0') 881146040Stjr codeset_name = getenv ("LC_CTYPE"); 882146040Stjr if (codeset_name == NULL || codeset_name[0] == '\0') 883146040Stjr codeset_name = getenv ("LANG"); 884146040Stjr if (codeset_name == NULL) 885146040Stjr codeset_name = ""; 886146040Stjr else if (strchr (codeset_name, '.') != NULL) 887146040Stjr codeset_name = strchr (codeset_name, '.') + 1; 888146040Stjr# endif 889146040Stjr 890146040Stjr if (strcasecmp (codeset_name, "UTF-8") == 0 891146040Stjr || strcasecmp (codeset_name, "UTF8") == 0) 892146040Stjr dfa->is_utf8 = 1; 893146040Stjr 894146040Stjr /* We check exhaustively in the loop below if this charset is a 895146040Stjr superset of ASCII. */ 896146040Stjr dfa->map_notascii = 0; 897146040Stjr#endif 898146040Stjr 899146040Stjr#ifdef RE_ENABLE_I18N 900146040Stjr if (dfa->mb_cur_max > 1) 901146040Stjr { 902146040Stjr if (dfa->is_utf8) 903146040Stjr dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 904146040Stjr else 905146040Stjr { 906146040Stjr int i, j, ch; 907146040Stjr 908146040Stjr dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1); 909146040Stjr if (BE (dfa->sb_char == NULL, 0)) 910146040Stjr return REG_ESPACE; 911146040Stjr 912146040Stjr /* Clear all bits by, then set those corresponding to single 913146040Stjr byte chars. */ 914146040Stjr bitset_empty (dfa->sb_char); 915146040Stjr 916146040Stjr for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 917146040Stjr for (j = 0; j < UINT_BITS; ++j, ++ch) 918146040Stjr { 919146040Stjr wchar_t wch = __btowc (ch); 920146040Stjr if (wch != WEOF) 921146040Stjr dfa->sb_char[i] |= 1 << j; 922146040Stjr# ifndef _LIBC 923146040Stjr if (isascii (ch) && wch != (wchar_t) ch) 924146040Stjr dfa->map_notascii = 1; 925146040Stjr# endif 926146040Stjr } 927146040Stjr } 928146040Stjr } 929146040Stjr#endif 930146040Stjr 931146040Stjr if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 932146040Stjr return REG_ESPACE; 933146040Stjr return REG_NOERROR; 934146040Stjr} 935146040Stjr 936146040Stjr/* Initialize WORD_CHAR table, which indicate which character is 937146040Stjr "word". In this case "word" means that it is the word construction 938146040Stjr character used by some operators like "\<", "\>", etc. */ 939146040Stjr 940146040Stjrstatic void 941146040Stjrinit_word_char (dfa) 942146040Stjr re_dfa_t *dfa; 943146040Stjr{ 944146040Stjr int i, j, ch; 945146040Stjr dfa->word_ops_used = 1; 946146040Stjr for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 947146040Stjr for (j = 0; j < UINT_BITS; ++j, ++ch) 948146040Stjr if (isalnum (ch) || ch == '_') 949146040Stjr dfa->word_char[i] |= 1 << j; 950146040Stjr} 951146040Stjr 952146040Stjr/* Free the work area which are only used while compiling. */ 953146040Stjr 954146040Stjrstatic void 955146040Stjrfree_workarea_compile (preg) 956146040Stjr regex_t *preg; 957146040Stjr{ 958146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 959146040Stjr bin_tree_storage_t *storage, *next; 960146040Stjr for (storage = dfa->str_tree_storage; storage; storage = next) 961146040Stjr { 962146040Stjr next = storage->next; 963146040Stjr re_free (storage); 964146040Stjr } 965146040Stjr dfa->str_tree_storage = NULL; 966146040Stjr dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 967146040Stjr dfa->str_tree = NULL; 968146040Stjr re_free (dfa->org_indices); 969146040Stjr dfa->org_indices = NULL; 970146040Stjr} 971146040Stjr 972146040Stjr/* Create initial states for all contexts. */ 973146040Stjr 974146040Stjrstatic reg_errcode_t 975146040Stjrcreate_initial_state (dfa) 976146040Stjr re_dfa_t *dfa; 977146040Stjr{ 978146040Stjr int first, i; 979146040Stjr reg_errcode_t err; 980146040Stjr re_node_set init_nodes; 981146040Stjr 982146040Stjr /* Initial states have the epsilon closure of the node which is 983146040Stjr the first node of the regular expression. */ 984146040Stjr first = dfa->str_tree->first->node_idx; 985146040Stjr dfa->init_node = first; 986146040Stjr err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 987146040Stjr if (BE (err != REG_NOERROR, 0)) 988146040Stjr return err; 989146040Stjr 990146040Stjr /* The back-references which are in initial states can epsilon transit, 991146040Stjr since in this case all of the subexpressions can be null. 992146040Stjr Then we add epsilon closures of the nodes which are the next nodes of 993146040Stjr the back-references. */ 994146040Stjr if (dfa->nbackref > 0) 995146040Stjr for (i = 0; i < init_nodes.nelem; ++i) 996146040Stjr { 997146040Stjr int node_idx = init_nodes.elems[i]; 998146040Stjr re_token_type_t type = dfa->nodes[node_idx].type; 999146040Stjr 1000146040Stjr int clexp_idx; 1001146040Stjr if (type != OP_BACK_REF) 1002146040Stjr continue; 1003146040Stjr for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1004146040Stjr { 1005146040Stjr re_token_t *clexp_node; 1006146040Stjr clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1007146040Stjr if (clexp_node->type == OP_CLOSE_SUBEXP 1008146040Stjr && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1009146040Stjr break; 1010146040Stjr } 1011146040Stjr if (clexp_idx == init_nodes.nelem) 1012146040Stjr continue; 1013146040Stjr 1014146040Stjr if (type == OP_BACK_REF) 1015146040Stjr { 1016146040Stjr int dest_idx = dfa->edests[node_idx].elems[0]; 1017146040Stjr if (!re_node_set_contains (&init_nodes, dest_idx)) 1018146040Stjr { 1019146040Stjr re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1020146040Stjr i = 0; 1021146040Stjr } 1022146040Stjr } 1023146040Stjr } 1024146040Stjr 1025146040Stjr /* It must be the first time to invoke acquire_state. */ 1026146040Stjr dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1027146040Stjr /* We don't check ERR here, since the initial state must not be NULL. */ 1028146040Stjr if (BE (dfa->init_state == NULL, 0)) 1029146040Stjr return err; 1030146040Stjr if (dfa->init_state->has_constraint) 1031146040Stjr { 1032146040Stjr dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1033146040Stjr CONTEXT_WORD); 1034146040Stjr dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1035146040Stjr CONTEXT_NEWLINE); 1036146040Stjr dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1037146040Stjr &init_nodes, 1038146040Stjr CONTEXT_NEWLINE 1039146040Stjr | CONTEXT_BEGBUF); 1040146040Stjr if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1041146040Stjr || dfa->init_state_begbuf == NULL, 0)) 1042146040Stjr return err; 1043146040Stjr } 1044146040Stjr else 1045146040Stjr dfa->init_state_word = dfa->init_state_nl 1046146040Stjr = dfa->init_state_begbuf = dfa->init_state; 1047146040Stjr 1048146040Stjr re_node_set_free (&init_nodes); 1049146040Stjr return REG_NOERROR; 1050146040Stjr} 1051146040Stjr 1052146040Stjr#ifdef RE_ENABLE_I18N 1053146040Stjr/* If it is possible to do searching in single byte encoding instead of UTF-8 1054146040Stjr to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1055146040Stjr DFA nodes where needed. */ 1056146040Stjr 1057146040Stjrstatic void 1058146040Stjroptimize_utf8 (dfa) 1059146040Stjr re_dfa_t *dfa; 1060146040Stjr{ 1061146040Stjr int node, i, mb_chars = 0, has_period = 0; 1062146040Stjr 1063146040Stjr for (node = 0; node < dfa->nodes_len; ++node) 1064146040Stjr switch (dfa->nodes[node].type) 1065146040Stjr { 1066146040Stjr case CHARACTER: 1067146040Stjr if (dfa->nodes[node].opr.c >= 0x80) 1068146040Stjr mb_chars = 1; 1069146040Stjr break; 1070146040Stjr case ANCHOR: 1071146040Stjr switch (dfa->nodes[node].opr.idx) 1072146040Stjr { 1073146040Stjr case LINE_FIRST: 1074146040Stjr case LINE_LAST: 1075146040Stjr case BUF_FIRST: 1076146040Stjr case BUF_LAST: 1077146040Stjr break; 1078146040Stjr default: 1079146040Stjr /* Word anchors etc. cannot be handled. */ 1080146040Stjr return; 1081146040Stjr } 1082146040Stjr break; 1083146040Stjr case OP_PERIOD: 1084146040Stjr has_period = 1; 1085146040Stjr break; 1086146040Stjr case OP_BACK_REF: 1087146040Stjr case OP_ALT: 1088146040Stjr case END_OF_RE: 1089146040Stjr case OP_DUP_ASTERISK: 1090146040Stjr case OP_OPEN_SUBEXP: 1091146040Stjr case OP_CLOSE_SUBEXP: 1092146040Stjr break; 1093146040Stjr case COMPLEX_BRACKET: 1094146040Stjr return; 1095146040Stjr case SIMPLE_BRACKET: 1096146040Stjr /* Just double check. */ 1097146040Stjr for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i) 1098146040Stjr if (dfa->nodes[node].opr.sbcset[i]) 1099146040Stjr return; 1100146040Stjr break; 1101146040Stjr default: 1102146040Stjr abort (); 1103146040Stjr } 1104146040Stjr 1105146040Stjr if (mb_chars || has_period) 1106146040Stjr for (node = 0; node < dfa->nodes_len; ++node) 1107146040Stjr { 1108146040Stjr if (dfa->nodes[node].type == CHARACTER 1109146040Stjr && dfa->nodes[node].opr.c >= 0x80) 1110146040Stjr dfa->nodes[node].mb_partial = 0; 1111146040Stjr else if (dfa->nodes[node].type == OP_PERIOD) 1112146040Stjr dfa->nodes[node].type = OP_UTF8_PERIOD; 1113146040Stjr } 1114146040Stjr 1115146040Stjr /* The search can be in single byte locale. */ 1116146040Stjr dfa->mb_cur_max = 1; 1117146040Stjr dfa->is_utf8 = 0; 1118146040Stjr dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1119146040Stjr} 1120146040Stjr#endif 1121146040Stjr 1122146040Stjr/* Analyze the structure tree, and calculate "first", "next", "edest", 1123146040Stjr "eclosure", and "inveclosure". */ 1124146040Stjr 1125146040Stjrstatic reg_errcode_t 1126146040Stjranalyze (preg) 1127146040Stjr regex_t *preg; 1128146040Stjr{ 1129146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1130146040Stjr reg_errcode_t ret; 1131146040Stjr 1132146040Stjr /* Allocate arrays. */ 1133146040Stjr dfa->nexts = re_malloc (int, dfa->nodes_alloc); 1134146040Stjr dfa->org_indices = re_malloc (int, dfa->nodes_alloc); 1135146040Stjr dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1136146040Stjr dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1137146040Stjr if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1138146040Stjr || dfa->eclosures == NULL, 0)) 1139146040Stjr return REG_ESPACE; 1140146040Stjr 1141146040Stjr dfa->subexp_map = re_malloc (int, preg->re_nsub); 1142146040Stjr if (dfa->subexp_map != NULL) 1143146040Stjr { 1144146040Stjr int i; 1145146040Stjr for (i = 0; i < preg->re_nsub; i++) 1146146040Stjr dfa->subexp_map[i] = i; 1147146040Stjr preorder (dfa->str_tree, optimize_subexps, dfa); 1148146040Stjr for (i = 0; i < preg->re_nsub; i++) 1149146040Stjr if (dfa->subexp_map[i] != i) 1150146040Stjr break; 1151146040Stjr if (i == preg->re_nsub) 1152146040Stjr { 1153146040Stjr free (dfa->subexp_map); 1154146040Stjr dfa->subexp_map = NULL; 1155146040Stjr } 1156146040Stjr } 1157146040Stjr 1158146040Stjr ret = postorder (dfa->str_tree, lower_subexps, preg); 1159146040Stjr if (BE (ret != REG_NOERROR, 0)) 1160146040Stjr return ret; 1161146040Stjr ret = postorder (dfa->str_tree, calc_first, dfa); 1162146040Stjr if (BE (ret != REG_NOERROR, 0)) 1163146040Stjr return ret; 1164146040Stjr preorder (dfa->str_tree, calc_next, dfa); 1165146040Stjr ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1166146040Stjr if (BE (ret != REG_NOERROR, 0)) 1167146040Stjr return ret; 1168146040Stjr ret = calc_eclosure (dfa); 1169146040Stjr if (BE (ret != REG_NOERROR, 0)) 1170146040Stjr return ret; 1171146040Stjr 1172146040Stjr /* We only need this during the prune_impossible_nodes pass in regexec.c; 1173146040Stjr skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1174146040Stjr if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1175146040Stjr || dfa->nbackref) 1176146040Stjr { 1177146040Stjr dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1178146040Stjr if (BE (dfa->inveclosures == NULL, 0)) 1179146040Stjr return REG_ESPACE; 1180146040Stjr ret = calc_inveclosure (dfa); 1181146040Stjr } 1182146040Stjr 1183146040Stjr return ret; 1184146040Stjr} 1185146040Stjr 1186146040Stjr/* Our parse trees are very unbalanced, so we cannot use a stack to 1187146040Stjr implement parse tree visits. Instead, we use parent pointers and 1188146040Stjr some hairy code in these two functions. */ 1189146040Stjrstatic reg_errcode_t 1190146040Stjrpostorder (root, fn, extra) 1191146040Stjr bin_tree_t *root; 1192146040Stjr reg_errcode_t (fn (void *, bin_tree_t *)); 1193146040Stjr void *extra; 1194146040Stjr{ 1195146040Stjr bin_tree_t *node, *prev; 1196146040Stjr 1197146040Stjr for (node = root; ; ) 1198146040Stjr { 1199146040Stjr /* Descend down the tree, preferably to the left (or to the right 1200146040Stjr if that's the only child). */ 1201146040Stjr while (node->left || node->right) 1202146040Stjr if (node->left) 1203146040Stjr node = node->left; 1204146040Stjr else 1205146040Stjr node = node->right; 1206146040Stjr 1207146040Stjr do 1208146040Stjr { 1209146040Stjr reg_errcode_t err = fn (extra, node); 1210146040Stjr if (BE (err != REG_NOERROR, 0)) 1211146040Stjr return err; 1212146040Stjr if (node->parent == NULL) 1213146040Stjr return REG_NOERROR; 1214146040Stjr prev = node; 1215146040Stjr node = node->parent; 1216146040Stjr } 1217146040Stjr /* Go up while we have a node that is reached from the right. */ 1218146040Stjr while (node->right == prev || node->right == NULL); 1219146040Stjr node = node->right; 1220146040Stjr } 1221146040Stjr} 1222146040Stjr 1223146040Stjrstatic reg_errcode_t 1224146040Stjrpreorder (root, fn, extra) 1225146040Stjr bin_tree_t *root; 1226146040Stjr reg_errcode_t (fn (void *, bin_tree_t *)); 1227146040Stjr void *extra; 1228146040Stjr{ 1229146040Stjr bin_tree_t *node; 1230146040Stjr 1231146040Stjr for (node = root; ; ) 1232146040Stjr { 1233146040Stjr reg_errcode_t err = fn (extra, node); 1234146040Stjr if (BE (err != REG_NOERROR, 0)) 1235146040Stjr return err; 1236146040Stjr 1237146040Stjr /* Go to the left node, or up and to the right. */ 1238146040Stjr if (node->left) 1239146040Stjr node = node->left; 1240146040Stjr else 1241146040Stjr { 1242146040Stjr bin_tree_t *prev = NULL; 1243146040Stjr while (node->right == prev || node->right == NULL) 1244146040Stjr { 1245146040Stjr prev = node; 1246146040Stjr node = node->parent; 1247146040Stjr if (!node) 1248146040Stjr return REG_NOERROR; 1249146040Stjr } 1250146040Stjr node = node->right; 1251146040Stjr } 1252146040Stjr } 1253146040Stjr} 1254146040Stjr 1255146040Stjr/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1256146040Stjr re_search_internal to map the inner one's opr.idx to this one's. Adjust 1257146040Stjr backreferences as well. Requires a preorder visit. */ 1258146040Stjrstatic reg_errcode_t 1259146040Stjroptimize_subexps (extra, node) 1260146040Stjr void *extra; 1261146040Stjr bin_tree_t *node; 1262146040Stjr{ 1263146040Stjr re_dfa_t *dfa = (re_dfa_t *) extra; 1264146040Stjr 1265146040Stjr if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1266146040Stjr { 1267146040Stjr int idx = node->token.opr.idx; 1268146040Stjr node->token.opr.idx = dfa->subexp_map[idx]; 1269146040Stjr dfa->used_bkref_map |= 1 << node->token.opr.idx; 1270146040Stjr } 1271146040Stjr 1272146040Stjr else if (node->token.type == SUBEXP 1273146040Stjr && node->left && node->left->token.type == SUBEXP) 1274146040Stjr { 1275146040Stjr int other_idx = node->left->token.opr.idx; 1276146040Stjr 1277146040Stjr node->left = node->left->left; 1278146040Stjr if (node->left) 1279146040Stjr node->left->parent = node; 1280146040Stjr 1281146040Stjr dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1282146040Stjr if (other_idx < 8 * sizeof (dfa->used_bkref_map)) 1283146040Stjr dfa->used_bkref_map &= ~(1 << other_idx); 1284146040Stjr } 1285146040Stjr 1286146040Stjr return REG_NOERROR; 1287146040Stjr} 1288146040Stjr 1289146040Stjr/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1290146040Stjr of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1291146040Stjrstatic reg_errcode_t 1292146040Stjrlower_subexps (extra, node) 1293146040Stjr void *extra; 1294146040Stjr bin_tree_t *node; 1295146040Stjr{ 1296146040Stjr regex_t *preg = (regex_t *) extra; 1297146040Stjr reg_errcode_t err = REG_NOERROR; 1298146040Stjr 1299146040Stjr if (node->left && node->left->token.type == SUBEXP) 1300146040Stjr { 1301146040Stjr node->left = lower_subexp (&err, preg, node->left); 1302146040Stjr if (node->left) 1303146040Stjr node->left->parent = node; 1304146040Stjr } 1305146040Stjr if (node->right && node->right->token.type == SUBEXP) 1306146040Stjr { 1307146040Stjr node->right = lower_subexp (&err, preg, node->right); 1308146040Stjr if (node->right) 1309146040Stjr node->right->parent = node; 1310146040Stjr } 1311146040Stjr 1312146040Stjr return err; 1313146040Stjr} 1314146040Stjr 1315146040Stjrstatic bin_tree_t * 1316146040Stjrlower_subexp (err, preg, node) 1317146040Stjr reg_errcode_t *err; 1318146040Stjr regex_t *preg; 1319146040Stjr bin_tree_t *node; 1320146040Stjr{ 1321146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1322146040Stjr bin_tree_t *body = node->left; 1323146040Stjr bin_tree_t *op, *cls, *tree1, *tree; 1324146040Stjr 1325146040Stjr if (preg->no_sub 1326146040Stjr /* We do not optimize empty subexpressions, because otherwise we may 1327146040Stjr have bad CONCAT nodes with NULL children. This is obviously not 1328146040Stjr very common, so we do not lose much. An example that triggers 1329146040Stjr this case is the sed "script" /\(\)/x. */ 1330146040Stjr && node->left != NULL 1331146040Stjr && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map) 1332146040Stjr || !(dfa->used_bkref_map & (1 << node->token.opr.idx)))) 1333146040Stjr return node->left; 1334146040Stjr 1335146040Stjr /* Convert the SUBEXP node to the concatenation of an 1336146040Stjr OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1337146040Stjr op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1338146040Stjr cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1339146040Stjr tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1340146040Stjr tree = create_tree (dfa, op, tree1, CONCAT); 1341146040Stjr if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1342146040Stjr { 1343146040Stjr *err = REG_ESPACE; 1344146040Stjr return NULL; 1345146040Stjr } 1346146040Stjr 1347146040Stjr op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1348146040Stjr op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1349146040Stjr return tree; 1350146040Stjr} 1351146040Stjr 1352146040Stjr/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1353146040Stjr nodes. Requires a postorder visit. */ 1354146040Stjrstatic reg_errcode_t 1355146040Stjrcalc_first (extra, node) 1356146040Stjr void *extra; 1357146040Stjr bin_tree_t *node; 1358146040Stjr{ 1359146040Stjr re_dfa_t *dfa = (re_dfa_t *) extra; 1360146040Stjr if (node->token.type == CONCAT) 1361146040Stjr { 1362146040Stjr node->first = node->left->first; 1363146040Stjr node->node_idx = node->left->node_idx; 1364146040Stjr } 1365146040Stjr else 1366146040Stjr { 1367146040Stjr node->first = node; 1368146040Stjr node->node_idx = re_dfa_add_node (dfa, node->token); 1369146040Stjr if (BE (node->node_idx == -1, 0)) 1370146040Stjr return REG_ESPACE; 1371146040Stjr } 1372146040Stjr return REG_NOERROR; 1373146040Stjr} 1374146040Stjr 1375146040Stjr/* Pass 2: compute NEXT on the tree. Preorder visit. */ 1376146040Stjrstatic reg_errcode_t 1377146040Stjrcalc_next (extra, node) 1378146040Stjr void *extra; 1379146040Stjr bin_tree_t *node; 1380146040Stjr{ 1381146040Stjr switch (node->token.type) 1382146040Stjr { 1383146040Stjr case OP_DUP_ASTERISK: 1384146040Stjr node->left->next = node; 1385146040Stjr break; 1386146040Stjr case CONCAT: 1387146040Stjr node->left->next = node->right->first; 1388146040Stjr node->right->next = node->next; 1389146040Stjr break; 1390146040Stjr default: 1391146040Stjr if (node->left) 1392146040Stjr node->left->next = node->next; 1393146040Stjr if (node->right) 1394146040Stjr node->right->next = node->next; 1395146040Stjr break; 1396146040Stjr } 1397146040Stjr return REG_NOERROR; 1398146040Stjr} 1399146040Stjr 1400146040Stjr/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1401146040Stjrstatic reg_errcode_t 1402146040Stjrlink_nfa_nodes (extra, node) 1403146040Stjr void *extra; 1404146040Stjr bin_tree_t *node; 1405146040Stjr{ 1406146040Stjr re_dfa_t *dfa = (re_dfa_t *) extra; 1407146040Stjr int idx = node->node_idx; 1408146040Stjr reg_errcode_t err = REG_NOERROR; 1409146040Stjr 1410146040Stjr switch (node->token.type) 1411146040Stjr { 1412146040Stjr case CONCAT: 1413146040Stjr break; 1414146040Stjr 1415146040Stjr case END_OF_RE: 1416146040Stjr assert (node->next == NULL); 1417146040Stjr break; 1418146040Stjr 1419146040Stjr case OP_DUP_ASTERISK: 1420146040Stjr case OP_ALT: 1421146040Stjr { 1422146040Stjr int left, right; 1423146040Stjr dfa->has_plural_match = 1; 1424146040Stjr if (node->left != NULL) 1425146040Stjr left = node->left->first->node_idx; 1426146040Stjr else 1427146040Stjr left = node->next->node_idx; 1428146040Stjr if (node->right != NULL) 1429146040Stjr right = node->right->first->node_idx; 1430146040Stjr else 1431146040Stjr right = node->next->node_idx; 1432146040Stjr assert (left > -1); 1433146040Stjr assert (right > -1); 1434146040Stjr err = re_node_set_init_2 (dfa->edests + idx, left, right); 1435146040Stjr } 1436146040Stjr break; 1437146040Stjr 1438146040Stjr case ANCHOR: 1439146040Stjr case OP_OPEN_SUBEXP: 1440146040Stjr case OP_CLOSE_SUBEXP: 1441146040Stjr err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1442146040Stjr break; 1443146040Stjr 1444146040Stjr case OP_BACK_REF: 1445146040Stjr dfa->nexts[idx] = node->next->node_idx; 1446146040Stjr if (node->token.type == OP_BACK_REF) 1447146040Stjr re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1448146040Stjr break; 1449146040Stjr 1450146040Stjr default: 1451146040Stjr assert (!IS_EPSILON_NODE (node->token.type)); 1452146040Stjr dfa->nexts[idx] = node->next->node_idx; 1453146040Stjr break; 1454146040Stjr } 1455146040Stjr 1456146040Stjr return err; 1457146040Stjr} 1458146040Stjr 1459146040Stjr/* Duplicate the epsilon closure of the node ROOT_NODE. 1460146040Stjr Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1461146040Stjr to their own constraint. */ 1462146040Stjr 1463146040Stjrstatic reg_errcode_t 1464146040Stjrduplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, 1465146040Stjr init_constraint) 1466146040Stjr re_dfa_t *dfa; 1467146040Stjr int top_org_node, top_clone_node, root_node; 1468146040Stjr unsigned int init_constraint; 1469146040Stjr{ 1470146040Stjr reg_errcode_t err; 1471146040Stjr int org_node, clone_node, ret; 1472146040Stjr unsigned int constraint = init_constraint; 1473146040Stjr for (org_node = top_org_node, clone_node = top_clone_node;;) 1474146040Stjr { 1475146040Stjr int org_dest, clone_dest; 1476146040Stjr if (dfa->nodes[org_node].type == OP_BACK_REF) 1477146040Stjr { 1478146040Stjr /* If the back reference epsilon-transit, its destination must 1479146040Stjr also have the constraint. Then duplicate the epsilon closure 1480146040Stjr of the destination of the back reference, and store it in 1481146040Stjr edests of the back reference. */ 1482146040Stjr org_dest = dfa->nexts[org_node]; 1483146040Stjr re_node_set_empty (dfa->edests + clone_node); 1484146040Stjr err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1485146040Stjr if (BE (err != REG_NOERROR, 0)) 1486146040Stjr return err; 1487146040Stjr dfa->nexts[clone_node] = dfa->nexts[org_node]; 1488146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1489146040Stjr if (BE (ret < 0, 0)) 1490146040Stjr return REG_ESPACE; 1491146040Stjr } 1492146040Stjr else if (dfa->edests[org_node].nelem == 0) 1493146040Stjr { 1494146040Stjr /* In case of the node can't epsilon-transit, don't duplicate the 1495146040Stjr destination and store the original destination as the 1496146040Stjr destination of the node. */ 1497146040Stjr dfa->nexts[clone_node] = dfa->nexts[org_node]; 1498146040Stjr break; 1499146040Stjr } 1500146040Stjr else if (dfa->edests[org_node].nelem == 1) 1501146040Stjr { 1502146040Stjr /* In case of the node can epsilon-transit, and it has only one 1503146040Stjr destination. */ 1504146040Stjr org_dest = dfa->edests[org_node].elems[0]; 1505146040Stjr re_node_set_empty (dfa->edests + clone_node); 1506146040Stjr if (dfa->nodes[org_node].type == ANCHOR) 1507146040Stjr { 1508146040Stjr /* In case of the node has another constraint, append it. */ 1509146040Stjr if (org_node == root_node && clone_node != org_node) 1510146040Stjr { 1511146040Stjr /* ...but if the node is root_node itself, it means the 1512146040Stjr epsilon closure have a loop, then tie it to the 1513146040Stjr destination of the root_node. */ 1514146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, 1515146040Stjr org_dest); 1516146040Stjr if (BE (ret < 0, 0)) 1517146040Stjr return REG_ESPACE; 1518146040Stjr break; 1519146040Stjr } 1520146040Stjr constraint |= dfa->nodes[org_node].opr.ctx_type; 1521146040Stjr } 1522146040Stjr err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1523146040Stjr if (BE (err != REG_NOERROR, 0)) 1524146040Stjr return err; 1525146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1526146040Stjr if (BE (ret < 0, 0)) 1527146040Stjr return REG_ESPACE; 1528146040Stjr } 1529146040Stjr else /* dfa->edests[org_node].nelem == 2 */ 1530146040Stjr { 1531146040Stjr /* In case of the node can epsilon-transit, and it has two 1532146040Stjr destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1533146040Stjr org_dest = dfa->edests[org_node].elems[0]; 1534146040Stjr re_node_set_empty (dfa->edests + clone_node); 1535146040Stjr /* Search for a duplicated node which satisfies the constraint. */ 1536146040Stjr clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1537146040Stjr if (clone_dest == -1) 1538146040Stjr { 1539146040Stjr /* There are no such a duplicated node, create a new one. */ 1540146040Stjr err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1541146040Stjr if (BE (err != REG_NOERROR, 0)) 1542146040Stjr return err; 1543146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1544146040Stjr if (BE (ret < 0, 0)) 1545146040Stjr return REG_ESPACE; 1546146040Stjr err = duplicate_node_closure (dfa, org_dest, clone_dest, 1547146040Stjr root_node, constraint); 1548146040Stjr if (BE (err != REG_NOERROR, 0)) 1549146040Stjr return err; 1550146040Stjr } 1551146040Stjr else 1552146040Stjr { 1553146040Stjr /* There are a duplicated node which satisfy the constraint, 1554146040Stjr use it to avoid infinite loop. */ 1555146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1556146040Stjr if (BE (ret < 0, 0)) 1557146040Stjr return REG_ESPACE; 1558146040Stjr } 1559146040Stjr 1560146040Stjr org_dest = dfa->edests[org_node].elems[1]; 1561146040Stjr err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1562146040Stjr if (BE (err != REG_NOERROR, 0)) 1563146040Stjr return err; 1564146040Stjr ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1565146040Stjr if (BE (ret < 0, 0)) 1566146040Stjr return REG_ESPACE; 1567146040Stjr } 1568146040Stjr org_node = org_dest; 1569146040Stjr clone_node = clone_dest; 1570146040Stjr } 1571146040Stjr return REG_NOERROR; 1572146040Stjr} 1573146040Stjr 1574146040Stjr/* Search for a node which is duplicated from the node ORG_NODE, and 1575146040Stjr satisfies the constraint CONSTRAINT. */ 1576146040Stjr 1577146040Stjrstatic int 1578146040Stjrsearch_duplicated_node (dfa, org_node, constraint) 1579146040Stjr re_dfa_t *dfa; 1580146040Stjr int org_node; 1581146040Stjr unsigned int constraint; 1582146040Stjr{ 1583146040Stjr int idx; 1584146040Stjr for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1585146040Stjr { 1586146040Stjr if (org_node == dfa->org_indices[idx] 1587146040Stjr && constraint == dfa->nodes[idx].constraint) 1588146040Stjr return idx; /* Found. */ 1589146040Stjr } 1590146040Stjr return -1; /* Not found. */ 1591146040Stjr} 1592146040Stjr 1593146040Stjr/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1594146040Stjr The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded, 1595146040Stjr otherwise return the error code. */ 1596146040Stjr 1597146040Stjrstatic reg_errcode_t 1598146040Stjrduplicate_node (new_idx, dfa, org_idx, constraint) 1599146040Stjr re_dfa_t *dfa; 1600146040Stjr int *new_idx, org_idx; 1601146040Stjr unsigned int constraint; 1602146040Stjr{ 1603146040Stjr int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1604146040Stjr if (BE (dup_idx == -1, 0)) 1605146040Stjr return REG_ESPACE; 1606146040Stjr dfa->nodes[dup_idx].constraint = constraint; 1607146040Stjr if (dfa->nodes[org_idx].type == ANCHOR) 1608146040Stjr dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; 1609146040Stjr dfa->nodes[dup_idx].duplicated = 1; 1610146040Stjr 1611146040Stjr /* Store the index of the original node. */ 1612146040Stjr dfa->org_indices[dup_idx] = org_idx; 1613146040Stjr *new_idx = dup_idx; 1614146040Stjr return REG_NOERROR; 1615146040Stjr} 1616146040Stjr 1617146040Stjrstatic reg_errcode_t 1618146040Stjrcalc_inveclosure (dfa) 1619146040Stjr re_dfa_t *dfa; 1620146040Stjr{ 1621146040Stjr int src, idx, ret; 1622146040Stjr for (idx = 0; idx < dfa->nodes_len; ++idx) 1623146040Stjr re_node_set_init_empty (dfa->inveclosures + idx); 1624146040Stjr 1625146040Stjr for (src = 0; src < dfa->nodes_len; ++src) 1626146040Stjr { 1627146040Stjr int *elems = dfa->eclosures[src].elems; 1628146040Stjr for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1629146040Stjr { 1630146040Stjr ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1631146040Stjr if (BE (ret == -1, 0)) 1632146040Stjr return REG_ESPACE; 1633146040Stjr } 1634146040Stjr } 1635146040Stjr 1636146040Stjr return REG_NOERROR; 1637146040Stjr} 1638146040Stjr 1639146040Stjr/* Calculate "eclosure" for all the node in DFA. */ 1640146040Stjr 1641146040Stjrstatic reg_errcode_t 1642146040Stjrcalc_eclosure (dfa) 1643146040Stjr re_dfa_t *dfa; 1644146040Stjr{ 1645146040Stjr int node_idx, incomplete; 1646146040Stjr#ifdef DEBUG 1647146040Stjr assert (dfa->nodes_len > 0); 1648146040Stjr#endif 1649146040Stjr incomplete = 0; 1650146040Stjr /* For each nodes, calculate epsilon closure. */ 1651146040Stjr for (node_idx = 0; ; ++node_idx) 1652146040Stjr { 1653146040Stjr reg_errcode_t err; 1654146040Stjr re_node_set eclosure_elem; 1655146040Stjr if (node_idx == dfa->nodes_len) 1656146040Stjr { 1657146040Stjr if (!incomplete) 1658146040Stjr break; 1659146040Stjr incomplete = 0; 1660146040Stjr node_idx = 0; 1661146040Stjr } 1662146040Stjr 1663146040Stjr#ifdef DEBUG 1664146040Stjr assert (dfa->eclosures[node_idx].nelem != -1); 1665146040Stjr#endif 1666146040Stjr 1667146040Stjr /* If we have already calculated, skip it. */ 1668146040Stjr if (dfa->eclosures[node_idx].nelem != 0) 1669146040Stjr continue; 1670146040Stjr /* Calculate epsilon closure of `node_idx'. */ 1671146040Stjr err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1); 1672146040Stjr if (BE (err != REG_NOERROR, 0)) 1673146040Stjr return err; 1674146040Stjr 1675146040Stjr if (dfa->eclosures[node_idx].nelem == 0) 1676146040Stjr { 1677146040Stjr incomplete = 1; 1678146040Stjr re_node_set_free (&eclosure_elem); 1679146040Stjr } 1680146040Stjr } 1681146040Stjr return REG_NOERROR; 1682146040Stjr} 1683146040Stjr 1684146040Stjr/* Calculate epsilon closure of NODE. */ 1685146040Stjr 1686146040Stjrstatic reg_errcode_t 1687146040Stjrcalc_eclosure_iter (new_set, dfa, node, root) 1688146040Stjr re_node_set *new_set; 1689146040Stjr re_dfa_t *dfa; 1690146040Stjr int node, root; 1691146040Stjr{ 1692146040Stjr reg_errcode_t err; 1693146040Stjr unsigned int constraint; 1694146040Stjr int i, incomplete; 1695146040Stjr re_node_set eclosure; 1696146040Stjr incomplete = 0; 1697146040Stjr err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1698146040Stjr if (BE (err != REG_NOERROR, 0)) 1699146040Stjr return err; 1700146040Stjr 1701146040Stjr /* This indicates that we are calculating this node now. 1702146040Stjr We reference this value to avoid infinite loop. */ 1703146040Stjr dfa->eclosures[node].nelem = -1; 1704146040Stjr 1705146040Stjr constraint = ((dfa->nodes[node].type == ANCHOR) 1706146040Stjr ? dfa->nodes[node].opr.ctx_type : 0); 1707146040Stjr /* If the current node has constraints, duplicate all nodes. 1708146040Stjr Since they must inherit the constraints. */ 1709146040Stjr if (constraint 1710146040Stjr && dfa->edests[node].nelem 1711146040Stjr && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1712146040Stjr { 1713146040Stjr int org_node, cur_node; 1714146040Stjr org_node = cur_node = node; 1715146040Stjr err = duplicate_node_closure (dfa, node, node, node, constraint); 1716146040Stjr if (BE (err != REG_NOERROR, 0)) 1717146040Stjr return err; 1718146040Stjr } 1719146040Stjr 1720146040Stjr /* Expand each epsilon destination nodes. */ 1721146040Stjr if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1722146040Stjr for (i = 0; i < dfa->edests[node].nelem; ++i) 1723146040Stjr { 1724146040Stjr re_node_set eclosure_elem; 1725146040Stjr int edest = dfa->edests[node].elems[i]; 1726146040Stjr /* If calculating the epsilon closure of `edest' is in progress, 1727146040Stjr return intermediate result. */ 1728146040Stjr if (dfa->eclosures[edest].nelem == -1) 1729146040Stjr { 1730146040Stjr incomplete = 1; 1731146040Stjr continue; 1732146040Stjr } 1733146040Stjr /* If we haven't calculated the epsilon closure of `edest' yet, 1734146040Stjr calculate now. Otherwise use calculated epsilon closure. */ 1735146040Stjr if (dfa->eclosures[edest].nelem == 0) 1736146040Stjr { 1737146040Stjr err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0); 1738146040Stjr if (BE (err != REG_NOERROR, 0)) 1739146040Stjr return err; 1740146040Stjr } 1741146040Stjr else 1742146040Stjr eclosure_elem = dfa->eclosures[edest]; 1743146040Stjr /* Merge the epsilon closure of `edest'. */ 1744146040Stjr re_node_set_merge (&eclosure, &eclosure_elem); 1745146040Stjr /* If the epsilon closure of `edest' is incomplete, 1746146040Stjr the epsilon closure of this node is also incomplete. */ 1747146040Stjr if (dfa->eclosures[edest].nelem == 0) 1748146040Stjr { 1749146040Stjr incomplete = 1; 1750146040Stjr re_node_set_free (&eclosure_elem); 1751146040Stjr } 1752146040Stjr } 1753146040Stjr 1754146040Stjr /* Epsilon closures include itself. */ 1755146040Stjr re_node_set_insert (&eclosure, node); 1756146040Stjr if (incomplete && !root) 1757146040Stjr dfa->eclosures[node].nelem = 0; 1758146040Stjr else 1759146040Stjr dfa->eclosures[node] = eclosure; 1760146040Stjr *new_set = eclosure; 1761146040Stjr return REG_NOERROR; 1762146040Stjr} 1763146040Stjr 1764146040Stjr/* Functions for token which are used in the parser. */ 1765146040Stjr 1766146040Stjr/* Fetch a token from INPUT. 1767146040Stjr We must not use this function inside bracket expressions. */ 1768146040Stjr 1769146040Stjrstatic void 1770146040Stjrfetch_token (result, input, syntax) 1771146040Stjr re_token_t *result; 1772146040Stjr re_string_t *input; 1773146040Stjr reg_syntax_t syntax; 1774146040Stjr{ 1775146040Stjr re_string_skip_bytes (input, peek_token (result, input, syntax)); 1776146040Stjr} 1777146040Stjr 1778146040Stjr/* Peek a token from INPUT, and return the length of the token. 1779146040Stjr We must not use this function inside bracket expressions. */ 1780146040Stjr 1781146040Stjrstatic int 1782146040Stjrpeek_token (token, input, syntax) 1783146040Stjr re_token_t *token; 1784146040Stjr re_string_t *input; 1785146040Stjr reg_syntax_t syntax; 1786146040Stjr{ 1787146040Stjr unsigned char c; 1788146040Stjr 1789146040Stjr if (re_string_eoi (input)) 1790146040Stjr { 1791146040Stjr token->type = END_OF_RE; 1792146040Stjr return 0; 1793146040Stjr } 1794146040Stjr 1795146040Stjr c = re_string_peek_byte (input, 0); 1796146040Stjr token->opr.c = c; 1797146040Stjr 1798146040Stjr token->word_char = 0; 1799146040Stjr#ifdef RE_ENABLE_I18N 1800146040Stjr token->mb_partial = 0; 1801146040Stjr if (input->mb_cur_max > 1 && 1802146040Stjr !re_string_first_byte (input, re_string_cur_idx (input))) 1803146040Stjr { 1804146040Stjr token->type = CHARACTER; 1805146040Stjr token->mb_partial = 1; 1806146040Stjr return 1; 1807146040Stjr } 1808146040Stjr#endif 1809146040Stjr if (c == '\\') 1810146040Stjr { 1811146040Stjr unsigned char c2; 1812146040Stjr if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1813146040Stjr { 1814146040Stjr token->type = BACK_SLASH; 1815146040Stjr return 1; 1816146040Stjr } 1817146040Stjr 1818146040Stjr c2 = re_string_peek_byte_case (input, 1); 1819146040Stjr token->opr.c = c2; 1820146040Stjr token->type = CHARACTER; 1821146040Stjr#ifdef RE_ENABLE_I18N 1822146040Stjr if (input->mb_cur_max > 1) 1823146040Stjr { 1824146040Stjr wint_t wc = re_string_wchar_at (input, 1825146040Stjr re_string_cur_idx (input) + 1); 1826146040Stjr token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1827146040Stjr } 1828146040Stjr else 1829146040Stjr#endif 1830146040Stjr token->word_char = IS_WORD_CHAR (c2) != 0; 1831146040Stjr 1832146040Stjr switch (c2) 1833146040Stjr { 1834146040Stjr case '|': 1835146040Stjr if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1836146040Stjr token->type = OP_ALT; 1837146040Stjr break; 1838146040Stjr case '1': case '2': case '3': case '4': case '5': 1839146040Stjr case '6': case '7': case '8': case '9': 1840146040Stjr if (!(syntax & RE_NO_BK_REFS)) 1841146040Stjr { 1842146040Stjr token->type = OP_BACK_REF; 1843146040Stjr token->opr.idx = c2 - '1'; 1844146040Stjr } 1845146040Stjr break; 1846146040Stjr case '<': 1847146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1848146040Stjr { 1849146040Stjr token->type = ANCHOR; 1850146040Stjr token->opr.ctx_type = WORD_FIRST; 1851146040Stjr } 1852146040Stjr break; 1853146040Stjr case '>': 1854146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1855146040Stjr { 1856146040Stjr token->type = ANCHOR; 1857146040Stjr token->opr.ctx_type = WORD_LAST; 1858146040Stjr } 1859146040Stjr break; 1860146040Stjr case 'b': 1861146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1862146040Stjr { 1863146040Stjr token->type = ANCHOR; 1864146040Stjr token->opr.ctx_type = WORD_DELIM; 1865146040Stjr } 1866146040Stjr break; 1867146040Stjr case 'B': 1868146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1869146040Stjr { 1870146040Stjr token->type = ANCHOR; 1871146040Stjr token->opr.ctx_type = NOT_WORD_DELIM; 1872146040Stjr } 1873146040Stjr break; 1874146040Stjr case 'w': 1875146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1876146040Stjr token->type = OP_WORD; 1877146040Stjr break; 1878146040Stjr case 'W': 1879146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1880146040Stjr token->type = OP_NOTWORD; 1881146040Stjr break; 1882146040Stjr case 's': 1883146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1884146040Stjr token->type = OP_SPACE; 1885146040Stjr break; 1886146040Stjr case 'S': 1887146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1888146040Stjr token->type = OP_NOTSPACE; 1889146040Stjr break; 1890146040Stjr case '`': 1891146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1892146040Stjr { 1893146040Stjr token->type = ANCHOR; 1894146040Stjr token->opr.ctx_type = BUF_FIRST; 1895146040Stjr } 1896146040Stjr break; 1897146040Stjr case '\'': 1898146040Stjr if (!(syntax & RE_NO_GNU_OPS)) 1899146040Stjr { 1900146040Stjr token->type = ANCHOR; 1901146040Stjr token->opr.ctx_type = BUF_LAST; 1902146040Stjr } 1903146040Stjr break; 1904146040Stjr case '(': 1905146040Stjr if (!(syntax & RE_NO_BK_PARENS)) 1906146040Stjr token->type = OP_OPEN_SUBEXP; 1907146040Stjr break; 1908146040Stjr case ')': 1909146040Stjr if (!(syntax & RE_NO_BK_PARENS)) 1910146040Stjr token->type = OP_CLOSE_SUBEXP; 1911146040Stjr break; 1912146040Stjr case '+': 1913146040Stjr if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1914146040Stjr token->type = OP_DUP_PLUS; 1915146040Stjr break; 1916146040Stjr case '?': 1917146040Stjr if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1918146040Stjr token->type = OP_DUP_QUESTION; 1919146040Stjr break; 1920146040Stjr case '{': 1921146040Stjr if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1922146040Stjr token->type = OP_OPEN_DUP_NUM; 1923146040Stjr break; 1924146040Stjr case '}': 1925146040Stjr if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1926146040Stjr token->type = OP_CLOSE_DUP_NUM; 1927146040Stjr break; 1928146040Stjr default: 1929146040Stjr break; 1930146040Stjr } 1931146040Stjr return 2; 1932146040Stjr } 1933146040Stjr 1934146040Stjr token->type = CHARACTER; 1935146040Stjr#ifdef RE_ENABLE_I18N 1936146040Stjr if (input->mb_cur_max > 1) 1937146040Stjr { 1938146040Stjr wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1939146040Stjr token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1940146040Stjr } 1941146040Stjr else 1942146040Stjr#endif 1943146040Stjr token->word_char = IS_WORD_CHAR (token->opr.c); 1944146040Stjr 1945146040Stjr switch (c) 1946146040Stjr { 1947146040Stjr case '\n': 1948146040Stjr if (syntax & RE_NEWLINE_ALT) 1949146040Stjr token->type = OP_ALT; 1950146040Stjr break; 1951146040Stjr case '|': 1952146040Stjr if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1953146040Stjr token->type = OP_ALT; 1954146040Stjr break; 1955146040Stjr case '*': 1956146040Stjr token->type = OP_DUP_ASTERISK; 1957146040Stjr break; 1958146040Stjr case '+': 1959146040Stjr if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1960146040Stjr token->type = OP_DUP_PLUS; 1961146040Stjr break; 1962146040Stjr case '?': 1963146040Stjr if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1964146040Stjr token->type = OP_DUP_QUESTION; 1965146040Stjr break; 1966146040Stjr case '{': 1967146040Stjr if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1968146040Stjr token->type = OP_OPEN_DUP_NUM; 1969146040Stjr break; 1970146040Stjr case '}': 1971146040Stjr if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1972146040Stjr token->type = OP_CLOSE_DUP_NUM; 1973146040Stjr break; 1974146040Stjr case '(': 1975146040Stjr if (syntax & RE_NO_BK_PARENS) 1976146040Stjr token->type = OP_OPEN_SUBEXP; 1977146040Stjr break; 1978146040Stjr case ')': 1979146040Stjr if (syntax & RE_NO_BK_PARENS) 1980146040Stjr token->type = OP_CLOSE_SUBEXP; 1981146040Stjr break; 1982146040Stjr case '[': 1983146040Stjr token->type = OP_OPEN_BRACKET; 1984146040Stjr break; 1985146040Stjr case '.': 1986146040Stjr token->type = OP_PERIOD; 1987146040Stjr break; 1988146040Stjr case '^': 1989146040Stjr if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 1990146040Stjr re_string_cur_idx (input) != 0) 1991146040Stjr { 1992146040Stjr char prev = re_string_peek_byte (input, -1); 1993146040Stjr if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 1994146040Stjr break; 1995146040Stjr } 1996146040Stjr token->type = ANCHOR; 1997146040Stjr token->opr.ctx_type = LINE_FIRST; 1998146040Stjr break; 1999146040Stjr case '$': 2000146040Stjr if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 2001146040Stjr re_string_cur_idx (input) + 1 != re_string_length (input)) 2002146040Stjr { 2003146040Stjr re_token_t next; 2004146040Stjr re_string_skip_bytes (input, 1); 2005146040Stjr peek_token (&next, input, syntax); 2006146040Stjr re_string_skip_bytes (input, -1); 2007146040Stjr if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 2008146040Stjr break; 2009146040Stjr } 2010146040Stjr token->type = ANCHOR; 2011146040Stjr token->opr.ctx_type = LINE_LAST; 2012146040Stjr break; 2013146040Stjr default: 2014146040Stjr break; 2015146040Stjr } 2016146040Stjr return 1; 2017146040Stjr} 2018146040Stjr 2019146040Stjr/* Peek a token from INPUT, and return the length of the token. 2020146040Stjr We must not use this function out of bracket expressions. */ 2021146040Stjr 2022146040Stjrstatic int 2023146040Stjrpeek_token_bracket (token, input, syntax) 2024146040Stjr re_token_t *token; 2025146040Stjr re_string_t *input; 2026146040Stjr reg_syntax_t syntax; 2027146040Stjr{ 2028146040Stjr unsigned char c; 2029146040Stjr if (re_string_eoi (input)) 2030146040Stjr { 2031146040Stjr token->type = END_OF_RE; 2032146040Stjr return 0; 2033146040Stjr } 2034146040Stjr c = re_string_peek_byte (input, 0); 2035146040Stjr token->opr.c = c; 2036146040Stjr 2037146040Stjr#ifdef RE_ENABLE_I18N 2038146040Stjr if (input->mb_cur_max > 1 && 2039146040Stjr !re_string_first_byte (input, re_string_cur_idx (input))) 2040146040Stjr { 2041146040Stjr token->type = CHARACTER; 2042146040Stjr return 1; 2043146040Stjr } 2044146040Stjr#endif /* RE_ENABLE_I18N */ 2045146040Stjr 2046146040Stjr if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2047146040Stjr && re_string_cur_idx (input) + 1 < re_string_length (input)) 2048146040Stjr { 2049146040Stjr /* In this case, '\' escape a character. */ 2050146040Stjr unsigned char c2; 2051146040Stjr re_string_skip_bytes (input, 1); 2052146040Stjr c2 = re_string_peek_byte (input, 0); 2053146040Stjr token->opr.c = c2; 2054146040Stjr token->type = CHARACTER; 2055146040Stjr return 1; 2056146040Stjr } 2057146040Stjr if (c == '[') /* '[' is a special char in a bracket exps. */ 2058146040Stjr { 2059146040Stjr unsigned char c2; 2060146040Stjr int token_len; 2061146040Stjr if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2062146040Stjr c2 = re_string_peek_byte (input, 1); 2063146040Stjr else 2064146040Stjr c2 = 0; 2065146040Stjr token->opr.c = c2; 2066146040Stjr token_len = 2; 2067146040Stjr switch (c2) 2068146040Stjr { 2069146040Stjr case '.': 2070146040Stjr token->type = OP_OPEN_COLL_ELEM; 2071146040Stjr break; 2072146040Stjr case '=': 2073146040Stjr token->type = OP_OPEN_EQUIV_CLASS; 2074146040Stjr break; 2075146040Stjr case ':': 2076146040Stjr if (syntax & RE_CHAR_CLASSES) 2077146040Stjr { 2078146040Stjr token->type = OP_OPEN_CHAR_CLASS; 2079146040Stjr break; 2080146040Stjr } 2081146040Stjr /* else fall through. */ 2082146040Stjr default: 2083146040Stjr token->type = CHARACTER; 2084146040Stjr token->opr.c = c; 2085146040Stjr token_len = 1; 2086146040Stjr break; 2087146040Stjr } 2088146040Stjr return token_len; 2089146040Stjr } 2090146040Stjr switch (c) 2091146040Stjr { 2092146040Stjr case '-': 2093146040Stjr token->type = OP_CHARSET_RANGE; 2094146040Stjr break; 2095146040Stjr case ']': 2096146040Stjr token->type = OP_CLOSE_BRACKET; 2097146040Stjr break; 2098146040Stjr case '^': 2099146040Stjr token->type = OP_NON_MATCH_LIST; 2100146040Stjr break; 2101146040Stjr default: 2102146040Stjr token->type = CHARACTER; 2103146040Stjr } 2104146040Stjr return 1; 2105146040Stjr} 2106146040Stjr 2107146040Stjr/* Functions for parser. */ 2108146040Stjr 2109146040Stjr/* Entry point of the parser. 2110146040Stjr Parse the regular expression REGEXP and return the structure tree. 2111146040Stjr If an error is occured, ERR is set by error code, and return NULL. 2112146040Stjr This function build the following tree, from regular expression <reg_exp>: 2113146040Stjr CAT 2114146040Stjr / \ 2115146040Stjr / \ 2116146040Stjr <reg_exp> EOR 2117146040Stjr 2118146040Stjr CAT means concatenation. 2119146040Stjr EOR means end of regular expression. */ 2120146040Stjr 2121146040Stjrstatic bin_tree_t * 2122146040Stjrparse (regexp, preg, syntax, err) 2123146040Stjr re_string_t *regexp; 2124146040Stjr regex_t *preg; 2125146040Stjr reg_syntax_t syntax; 2126146040Stjr reg_errcode_t *err; 2127146040Stjr{ 2128146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2129146040Stjr bin_tree_t *tree, *eor, *root; 2130146040Stjr re_token_t current_token; 2131146040Stjr dfa->syntax = syntax; 2132146040Stjr fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2133146040Stjr tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2134146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2135146040Stjr return NULL; 2136146040Stjr eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2137146040Stjr if (tree != NULL) 2138146040Stjr root = create_tree (dfa, tree, eor, CONCAT); 2139146040Stjr else 2140146040Stjr root = eor; 2141146040Stjr if (BE (eor == NULL || root == NULL, 0)) 2142146040Stjr { 2143146040Stjr *err = REG_ESPACE; 2144146040Stjr return NULL; 2145146040Stjr } 2146146040Stjr return root; 2147146040Stjr} 2148146040Stjr 2149146040Stjr/* This function build the following tree, from regular expression 2150146040Stjr <branch1>|<branch2>: 2151146040Stjr ALT 2152146040Stjr / \ 2153146040Stjr / \ 2154146040Stjr <branch1> <branch2> 2155146040Stjr 2156146040Stjr ALT means alternative, which represents the operator `|'. */ 2157146040Stjr 2158146040Stjrstatic bin_tree_t * 2159146040Stjrparse_reg_exp (regexp, preg, token, syntax, nest, err) 2160146040Stjr re_string_t *regexp; 2161146040Stjr regex_t *preg; 2162146040Stjr re_token_t *token; 2163146040Stjr reg_syntax_t syntax; 2164146040Stjr int nest; 2165146040Stjr reg_errcode_t *err; 2166146040Stjr{ 2167146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2168146040Stjr bin_tree_t *tree, *branch = NULL; 2169146040Stjr tree = parse_branch (regexp, preg, token, syntax, nest, err); 2170146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2171146040Stjr return NULL; 2172146040Stjr 2173146040Stjr while (token->type == OP_ALT) 2174146040Stjr { 2175146040Stjr fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2176146040Stjr if (token->type != OP_ALT && token->type != END_OF_RE 2177146040Stjr && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2178146040Stjr { 2179146040Stjr branch = parse_branch (regexp, preg, token, syntax, nest, err); 2180146040Stjr if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2181146040Stjr return NULL; 2182146040Stjr } 2183146040Stjr else 2184146040Stjr branch = NULL; 2185146040Stjr tree = create_tree (dfa, tree, branch, OP_ALT); 2186146040Stjr if (BE (tree == NULL, 0)) 2187146040Stjr { 2188146040Stjr *err = REG_ESPACE; 2189146040Stjr return NULL; 2190146040Stjr } 2191146040Stjr } 2192146040Stjr return tree; 2193146040Stjr} 2194146040Stjr 2195146040Stjr/* This function build the following tree, from regular expression 2196146040Stjr <exp1><exp2>: 2197146040Stjr CAT 2198146040Stjr / \ 2199146040Stjr / \ 2200146040Stjr <exp1> <exp2> 2201146040Stjr 2202146040Stjr CAT means concatenation. */ 2203146040Stjr 2204146040Stjrstatic bin_tree_t * 2205146040Stjrparse_branch (regexp, preg, token, syntax, nest, err) 2206146040Stjr re_string_t *regexp; 2207146040Stjr regex_t *preg; 2208146040Stjr re_token_t *token; 2209146040Stjr reg_syntax_t syntax; 2210146040Stjr int nest; 2211146040Stjr reg_errcode_t *err; 2212146040Stjr{ 2213146040Stjr bin_tree_t *tree, *exp; 2214146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2215146040Stjr tree = parse_expression (regexp, preg, token, syntax, nest, err); 2216146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2217146040Stjr return NULL; 2218146040Stjr 2219146040Stjr while (token->type != OP_ALT && token->type != END_OF_RE 2220146040Stjr && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2221146040Stjr { 2222146040Stjr exp = parse_expression (regexp, preg, token, syntax, nest, err); 2223146040Stjr if (BE (*err != REG_NOERROR && exp == NULL, 0)) 2224146040Stjr { 2225146040Stjr return NULL; 2226146040Stjr } 2227146040Stjr if (tree != NULL && exp != NULL) 2228146040Stjr { 2229146040Stjr tree = create_tree (dfa, tree, exp, CONCAT); 2230146040Stjr if (tree == NULL) 2231146040Stjr { 2232146040Stjr *err = REG_ESPACE; 2233146040Stjr return NULL; 2234146040Stjr } 2235146040Stjr } 2236146040Stjr else if (tree == NULL) 2237146040Stjr tree = exp; 2238146040Stjr /* Otherwise exp == NULL, we don't need to create new tree. */ 2239146040Stjr } 2240146040Stjr return tree; 2241146040Stjr} 2242146040Stjr 2243146040Stjr/* This function build the following tree, from regular expression a*: 2244146040Stjr * 2245146040Stjr | 2246146040Stjr a 2247146040Stjr*/ 2248146040Stjr 2249146040Stjrstatic bin_tree_t * 2250146040Stjrparse_expression (regexp, preg, token, syntax, nest, err) 2251146040Stjr re_string_t *regexp; 2252146040Stjr regex_t *preg; 2253146040Stjr re_token_t *token; 2254146040Stjr reg_syntax_t syntax; 2255146040Stjr int nest; 2256146040Stjr reg_errcode_t *err; 2257146040Stjr{ 2258146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2259146040Stjr bin_tree_t *tree; 2260146040Stjr switch (token->type) 2261146040Stjr { 2262146040Stjr case CHARACTER: 2263146040Stjr tree = create_token_tree (dfa, NULL, NULL, token); 2264146040Stjr if (BE (tree == NULL, 0)) 2265146040Stjr { 2266146040Stjr *err = REG_ESPACE; 2267146040Stjr return NULL; 2268146040Stjr } 2269146040Stjr#ifdef RE_ENABLE_I18N 2270146040Stjr if (dfa->mb_cur_max > 1) 2271146040Stjr { 2272146040Stjr while (!re_string_eoi (regexp) 2273146040Stjr && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2274146040Stjr { 2275146040Stjr bin_tree_t *mbc_remain; 2276146040Stjr fetch_token (token, regexp, syntax); 2277146040Stjr mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2278146040Stjr tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2279146040Stjr if (BE (mbc_remain == NULL || tree == NULL, 0)) 2280146040Stjr { 2281146040Stjr *err = REG_ESPACE; 2282146040Stjr return NULL; 2283146040Stjr } 2284146040Stjr } 2285146040Stjr } 2286146040Stjr#endif 2287146040Stjr break; 2288146040Stjr case OP_OPEN_SUBEXP: 2289146040Stjr tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2290146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2291146040Stjr return NULL; 2292146040Stjr break; 2293146040Stjr case OP_OPEN_BRACKET: 2294146040Stjr tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2295146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2296146040Stjr return NULL; 2297146040Stjr break; 2298146040Stjr case OP_BACK_REF: 2299146040Stjr if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2300146040Stjr { 2301146040Stjr *err = REG_ESUBREG; 2302146040Stjr return NULL; 2303146040Stjr } 2304146040Stjr dfa->used_bkref_map |= 1 << token->opr.idx; 2305146040Stjr tree = create_token_tree (dfa, NULL, NULL, token); 2306146040Stjr if (BE (tree == NULL, 0)) 2307146040Stjr { 2308146040Stjr *err = REG_ESPACE; 2309146040Stjr return NULL; 2310146040Stjr } 2311146040Stjr ++dfa->nbackref; 2312146040Stjr dfa->has_mb_node = 1; 2313146040Stjr break; 2314146040Stjr case OP_OPEN_DUP_NUM: 2315146040Stjr if (syntax & RE_CONTEXT_INVALID_DUP) 2316146040Stjr { 2317146040Stjr *err = REG_BADRPT; 2318146040Stjr return NULL; 2319146040Stjr } 2320146040Stjr /* FALLTHROUGH */ 2321146040Stjr case OP_DUP_ASTERISK: 2322146040Stjr case OP_DUP_PLUS: 2323146040Stjr case OP_DUP_QUESTION: 2324146040Stjr if (syntax & RE_CONTEXT_INVALID_OPS) 2325146040Stjr { 2326146040Stjr *err = REG_BADRPT; 2327146040Stjr return NULL; 2328146040Stjr } 2329146040Stjr else if (syntax & RE_CONTEXT_INDEP_OPS) 2330146040Stjr { 2331146040Stjr fetch_token (token, regexp, syntax); 2332146040Stjr return parse_expression (regexp, preg, token, syntax, nest, err); 2333146040Stjr } 2334146040Stjr /* else fall through */ 2335146040Stjr case OP_CLOSE_SUBEXP: 2336146040Stjr if ((token->type == OP_CLOSE_SUBEXP) && 2337146040Stjr !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2338146040Stjr { 2339146040Stjr *err = REG_ERPAREN; 2340146040Stjr return NULL; 2341146040Stjr } 2342146040Stjr /* else fall through */ 2343146040Stjr case OP_CLOSE_DUP_NUM: 2344146040Stjr /* We treat it as a normal character. */ 2345146040Stjr 2346146040Stjr /* Then we can these characters as normal characters. */ 2347146040Stjr token->type = CHARACTER; 2348146040Stjr /* mb_partial and word_char bits should be initialized already 2349146040Stjr by peek_token. */ 2350146040Stjr tree = create_token_tree (dfa, NULL, NULL, token); 2351146040Stjr if (BE (tree == NULL, 0)) 2352146040Stjr { 2353146040Stjr *err = REG_ESPACE; 2354146040Stjr return NULL; 2355146040Stjr } 2356146040Stjr break; 2357146040Stjr case ANCHOR: 2358146040Stjr if ((token->opr.ctx_type 2359146040Stjr & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2360146040Stjr && dfa->word_ops_used == 0) 2361146040Stjr init_word_char (dfa); 2362146040Stjr if (token->opr.ctx_type == WORD_DELIM 2363146040Stjr || token->opr.ctx_type == NOT_WORD_DELIM) 2364146040Stjr { 2365146040Stjr bin_tree_t *tree_first, *tree_last; 2366146040Stjr if (token->opr.ctx_type == WORD_DELIM) 2367146040Stjr { 2368146040Stjr token->opr.ctx_type = WORD_FIRST; 2369146040Stjr tree_first = create_token_tree (dfa, NULL, NULL, token); 2370146040Stjr token->opr.ctx_type = WORD_LAST; 2371146040Stjr } 2372146040Stjr else 2373146040Stjr { 2374146040Stjr token->opr.ctx_type = INSIDE_WORD; 2375146040Stjr tree_first = create_token_tree (dfa, NULL, NULL, token); 2376146040Stjr token->opr.ctx_type = INSIDE_NOTWORD; 2377146040Stjr } 2378146040Stjr tree_last = create_token_tree (dfa, NULL, NULL, token); 2379146040Stjr tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2380146040Stjr if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2381146040Stjr { 2382146040Stjr *err = REG_ESPACE; 2383146040Stjr return NULL; 2384146040Stjr } 2385146040Stjr } 2386146040Stjr else 2387146040Stjr { 2388146040Stjr tree = create_token_tree (dfa, NULL, NULL, token); 2389146040Stjr if (BE (tree == NULL, 0)) 2390146040Stjr { 2391146040Stjr *err = REG_ESPACE; 2392146040Stjr return NULL; 2393146040Stjr } 2394146040Stjr } 2395146040Stjr /* We must return here, since ANCHORs can't be followed 2396146040Stjr by repetition operators. 2397146040Stjr eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2398146040Stjr it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2399146040Stjr fetch_token (token, regexp, syntax); 2400146040Stjr return tree; 2401146040Stjr case OP_PERIOD: 2402146040Stjr tree = create_token_tree (dfa, NULL, NULL, token); 2403146040Stjr if (BE (tree == NULL, 0)) 2404146040Stjr { 2405146040Stjr *err = REG_ESPACE; 2406146040Stjr return NULL; 2407146040Stjr } 2408146040Stjr if (dfa->mb_cur_max > 1) 2409146040Stjr dfa->has_mb_node = 1; 2410146040Stjr break; 2411146040Stjr case OP_WORD: 2412146040Stjr case OP_NOTWORD: 2413146040Stjr tree = build_charclass_op (dfa, regexp->trans, 2414146040Stjr (const unsigned char *) "alnum", 2415146040Stjr (const unsigned char *) "_", 2416146040Stjr token->type == OP_NOTWORD, err); 2417146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2418146040Stjr return NULL; 2419146040Stjr break; 2420146040Stjr case OP_SPACE: 2421146040Stjr case OP_NOTSPACE: 2422146040Stjr tree = build_charclass_op (dfa, regexp->trans, 2423146040Stjr (const unsigned char *) "space", 2424146040Stjr (const unsigned char *) "", 2425146040Stjr token->type == OP_NOTSPACE, err); 2426146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2427146040Stjr return NULL; 2428146040Stjr break; 2429146040Stjr case OP_ALT: 2430146040Stjr case END_OF_RE: 2431146040Stjr return NULL; 2432146040Stjr case BACK_SLASH: 2433146040Stjr *err = REG_EESCAPE; 2434146040Stjr return NULL; 2435146040Stjr default: 2436146040Stjr /* Must not happen? */ 2437146040Stjr#ifdef DEBUG 2438146040Stjr assert (0); 2439146040Stjr#endif 2440146040Stjr return NULL; 2441146040Stjr } 2442146040Stjr fetch_token (token, regexp, syntax); 2443146040Stjr 2444146040Stjr while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2445146040Stjr || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2446146040Stjr { 2447146040Stjr tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2448146040Stjr if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2449146040Stjr return NULL; 2450146040Stjr /* In BRE consecutive duplications are not allowed. */ 2451146040Stjr if ((syntax & RE_CONTEXT_INVALID_DUP) 2452146040Stjr && (token->type == OP_DUP_ASTERISK 2453146040Stjr || token->type == OP_OPEN_DUP_NUM)) 2454146040Stjr { 2455146040Stjr *err = REG_BADRPT; 2456146040Stjr return NULL; 2457146040Stjr } 2458146040Stjr } 2459146040Stjr 2460146040Stjr return tree; 2461146040Stjr} 2462146040Stjr 2463146040Stjr/* This function build the following tree, from regular expression 2464146040Stjr (<reg_exp>): 2465146040Stjr SUBEXP 2466146040Stjr | 2467146040Stjr <reg_exp> 2468146040Stjr*/ 2469146040Stjr 2470146040Stjrstatic bin_tree_t * 2471146040Stjrparse_sub_exp (regexp, preg, token, syntax, nest, err) 2472146040Stjr re_string_t *regexp; 2473146040Stjr regex_t *preg; 2474146040Stjr re_token_t *token; 2475146040Stjr reg_syntax_t syntax; 2476146040Stjr int nest; 2477146040Stjr reg_errcode_t *err; 2478146040Stjr{ 2479146040Stjr re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2480146040Stjr bin_tree_t *tree; 2481146040Stjr size_t cur_nsub; 2482146040Stjr cur_nsub = preg->re_nsub++; 2483146040Stjr 2484146040Stjr fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2485146040Stjr 2486146040Stjr /* The subexpression may be a null string. */ 2487146040Stjr if (token->type == OP_CLOSE_SUBEXP) 2488146040Stjr tree = NULL; 2489146040Stjr else 2490146040Stjr { 2491146040Stjr tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2492146040Stjr if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2493146040Stjr *err = REG_EPAREN; 2494146040Stjr if (BE (*err != REG_NOERROR, 0)) 2495146040Stjr return NULL; 2496146040Stjr } 2497146040Stjr dfa->completed_bkref_map |= 1 << cur_nsub; 2498146040Stjr 2499146040Stjr tree = create_tree (dfa, tree, NULL, SUBEXP); 2500146040Stjr if (BE (tree == NULL, 0)) 2501146040Stjr { 2502146040Stjr *err = REG_ESPACE; 2503146040Stjr return NULL; 2504146040Stjr } 2505146040Stjr tree->token.opr.idx = cur_nsub; 2506146040Stjr return tree; 2507146040Stjr} 2508146040Stjr 2509146040Stjr/* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2510146040Stjr 2511146040Stjrstatic bin_tree_t * 2512146040Stjrparse_dup_op (elem, regexp, dfa, token, syntax, err) 2513146040Stjr bin_tree_t *elem; 2514146040Stjr re_string_t *regexp; 2515146040Stjr re_dfa_t *dfa; 2516146040Stjr re_token_t *token; 2517146040Stjr reg_syntax_t syntax; 2518146040Stjr reg_errcode_t *err; 2519146040Stjr{ 2520146040Stjr bin_tree_t *tree = NULL, *old_tree = NULL; 2521146040Stjr int i, start, end, start_idx = re_string_cur_idx (regexp); 2522146040Stjr re_token_t start_token = *token; 2523146040Stjr 2524146040Stjr if (token->type == OP_OPEN_DUP_NUM) 2525146040Stjr { 2526146040Stjr end = 0; 2527146040Stjr start = fetch_number (regexp, token, syntax); 2528146040Stjr if (start == -1) 2529146040Stjr { 2530146040Stjr if (token->type == CHARACTER && token->opr.c == ',') 2531146040Stjr start = 0; /* We treat "{,m}" as "{0,m}". */ 2532146040Stjr else 2533146040Stjr { 2534146040Stjr *err = REG_BADBR; /* <re>{} is invalid. */ 2535146040Stjr return NULL; 2536146040Stjr } 2537146040Stjr } 2538146040Stjr if (BE (start != -2, 1)) 2539146040Stjr { 2540146040Stjr /* We treat "{n}" as "{n,n}". */ 2541146040Stjr end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2542146040Stjr : ((token->type == CHARACTER && token->opr.c == ',') 2543146040Stjr ? fetch_number (regexp, token, syntax) : -2)); 2544146040Stjr } 2545146040Stjr if (BE (start == -2 || end == -2, 0)) 2546146040Stjr { 2547146040Stjr /* Invalid sequence. */ 2548146040Stjr if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2549146040Stjr { 2550146040Stjr if (token->type == END_OF_RE) 2551146040Stjr *err = REG_EBRACE; 2552146040Stjr else 2553146040Stjr *err = REG_BADBR; 2554146040Stjr 2555146040Stjr return NULL; 2556146040Stjr } 2557146040Stjr 2558146040Stjr /* If the syntax bit is set, rollback. */ 2559146040Stjr re_string_set_index (regexp, start_idx); 2560146040Stjr *token = start_token; 2561146040Stjr token->type = CHARACTER; 2562146040Stjr /* mb_partial and word_char bits should be already initialized by 2563146040Stjr peek_token. */ 2564146040Stjr return elem; 2565146040Stjr } 2566146040Stjr 2567146040Stjr if (BE (end != -1 && start > end, 0)) 2568146040Stjr { 2569146040Stjr /* First number greater than second. */ 2570146040Stjr *err = REG_BADBR; 2571146040Stjr return NULL; 2572146040Stjr } 2573146040Stjr } 2574146040Stjr else 2575146040Stjr { 2576146040Stjr start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2577146040Stjr end = (token->type == OP_DUP_QUESTION) ? 1 : -1; 2578146040Stjr } 2579146040Stjr 2580146040Stjr fetch_token (token, regexp, syntax); 2581146040Stjr 2582146040Stjr if (BE (elem == NULL, 0)) 2583146040Stjr return NULL; 2584146040Stjr if (BE (start == 0 && end == 0, 0)) 2585146040Stjr { 2586146040Stjr postorder (elem, free_tree, NULL); 2587146040Stjr return NULL; 2588146040Stjr } 2589146040Stjr 2590146040Stjr /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2591146040Stjr if (BE (start > 0, 0)) 2592146040Stjr { 2593146040Stjr tree = elem; 2594146040Stjr for (i = 2; i <= start; ++i) 2595146040Stjr { 2596146040Stjr elem = duplicate_tree (elem, dfa); 2597146040Stjr tree = create_tree (dfa, tree, elem, CONCAT); 2598146040Stjr if (BE (elem == NULL || tree == NULL, 0)) 2599146040Stjr goto parse_dup_op_espace; 2600146040Stjr } 2601146040Stjr 2602146040Stjr if (start == end) 2603146040Stjr return tree; 2604146040Stjr 2605146040Stjr /* Duplicate ELEM before it is marked optional. */ 2606146040Stjr elem = duplicate_tree (elem, dfa); 2607146040Stjr old_tree = tree; 2608146040Stjr } 2609146040Stjr else 2610146040Stjr old_tree = NULL; 2611146040Stjr 2612146040Stjr if (elem->token.type == SUBEXP) 2613146040Stjr postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2614146040Stjr 2615146040Stjr tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT)); 2616146040Stjr if (BE (tree == NULL, 0)) 2617146040Stjr goto parse_dup_op_espace; 2618146040Stjr 2619146040Stjr /* This loop is actually executed only when end != -1, 2620146040Stjr to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2621146040Stjr already created the start+1-th copy. */ 2622146040Stjr for (i = start + 2; i <= end; ++i) 2623146040Stjr { 2624146040Stjr elem = duplicate_tree (elem, dfa); 2625146040Stjr tree = create_tree (dfa, tree, elem, CONCAT); 2626146040Stjr if (BE (elem == NULL || tree == NULL, 0)) 2627146040Stjr goto parse_dup_op_espace; 2628146040Stjr 2629146040Stjr tree = create_tree (dfa, tree, NULL, OP_ALT); 2630146040Stjr if (BE (tree == NULL, 0)) 2631146040Stjr goto parse_dup_op_espace; 2632146040Stjr } 2633146040Stjr 2634146040Stjr if (old_tree) 2635146040Stjr tree = create_tree (dfa, old_tree, tree, CONCAT); 2636146040Stjr 2637146040Stjr return tree; 2638146040Stjr 2639146040Stjr parse_dup_op_espace: 2640146040Stjr *err = REG_ESPACE; 2641146040Stjr return NULL; 2642146040Stjr} 2643146040Stjr 2644146040Stjr/* Size of the names for collating symbol/equivalence_class/character_class. 2645146040Stjr I'm not sure, but maybe enough. */ 2646146040Stjr#define BRACKET_NAME_BUF_SIZE 32 2647146040Stjr 2648146040Stjr#ifndef _LIBC 2649146040Stjr /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2650146040Stjr Build the range expression which starts from START_ELEM, and ends 2651146040Stjr at END_ELEM. The result are written to MBCSET and SBCSET. 2652146040Stjr RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2653146040Stjr mbcset->range_ends, is a pointer argument sinse we may 2654146040Stjr update it. */ 2655146040Stjr 2656146040Stjrstatic reg_errcode_t 2657146040Stjr# ifdef RE_ENABLE_I18N 2658146040Stjrbuild_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2659146040Stjr re_charset_t *mbcset; 2660146040Stjr int *range_alloc; 2661146040Stjr# else /* not RE_ENABLE_I18N */ 2662146040Stjrbuild_range_exp (sbcset, start_elem, end_elem) 2663146040Stjr# endif /* not RE_ENABLE_I18N */ 2664146040Stjr re_bitset_ptr_t sbcset; 2665146040Stjr bracket_elem_t *start_elem, *end_elem; 2666146040Stjr{ 2667146040Stjr unsigned int start_ch, end_ch; 2668146040Stjr /* Equivalence Classes and Character Classes can't be a range start/end. */ 2669146040Stjr if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2670146040Stjr || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2671146040Stjr 0)) 2672146040Stjr return REG_ERANGE; 2673146040Stjr 2674146040Stjr /* We can handle no multi character collating elements without libc 2675146040Stjr support. */ 2676146040Stjr if (BE ((start_elem->type == COLL_SYM 2677146040Stjr && strlen ((char *) start_elem->opr.name) > 1) 2678146040Stjr || (end_elem->type == COLL_SYM 2679146040Stjr && strlen ((char *) end_elem->opr.name) > 1), 0)) 2680146040Stjr return REG_ECOLLATE; 2681146040Stjr 2682146040Stjr# ifdef RE_ENABLE_I18N 2683146040Stjr { 2684146040Stjr wchar_t wc, start_wc, end_wc; 2685146040Stjr wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2686146040Stjr 2687146040Stjr start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2688146040Stjr : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2689146040Stjr : 0)); 2690146040Stjr end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2691146040Stjr : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2692146040Stjr : 0)); 2693146040Stjr start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2694146040Stjr ? __btowc (start_ch) : start_elem->opr.wch); 2695146040Stjr end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2696146040Stjr ? __btowc (end_ch) : end_elem->opr.wch); 2697146040Stjr if (start_wc == WEOF || end_wc == WEOF) 2698146040Stjr return REG_ECOLLATE; 2699146040Stjr cmp_buf[0] = start_wc; 2700146040Stjr cmp_buf[4] = end_wc; 2701146040Stjr if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2702146040Stjr return REG_ERANGE; 2703146040Stjr 2704146040Stjr /* Got valid collation sequence values, add them as a new entry. 2705146040Stjr However, for !_LIBC we have no collation elements: if the 2706146040Stjr character set is single byte, the single byte character set 2707146040Stjr that we build below suffices. parse_bracket_exp passes 2708146040Stjr no MBCSET if dfa->mb_cur_max == 1. */ 2709146040Stjr if (mbcset) 2710146040Stjr { 2711146040Stjr /* Check the space of the arrays. */ 2712146040Stjr if (BE (*range_alloc == mbcset->nranges, 0)) 2713146040Stjr { 2714146040Stjr /* There is not enough space, need realloc. */ 2715146040Stjr wchar_t *new_array_start, *new_array_end; 2716146040Stjr int new_nranges; 2717146040Stjr 2718146040Stjr /* +1 in case of mbcset->nranges is 0. */ 2719146040Stjr new_nranges = 2 * mbcset->nranges + 1; 2720146040Stjr /* Use realloc since mbcset->range_starts and mbcset->range_ends 2721146040Stjr are NULL if *range_alloc == 0. */ 2722146040Stjr new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2723146040Stjr new_nranges); 2724146040Stjr new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2725146040Stjr new_nranges); 2726146040Stjr 2727146040Stjr if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2728146040Stjr return REG_ESPACE; 2729146040Stjr 2730146040Stjr mbcset->range_starts = new_array_start; 2731146040Stjr mbcset->range_ends = new_array_end; 2732146040Stjr *range_alloc = new_nranges; 2733146040Stjr } 2734146040Stjr 2735146040Stjr mbcset->range_starts[mbcset->nranges] = start_wc; 2736146040Stjr mbcset->range_ends[mbcset->nranges++] = end_wc; 2737146040Stjr } 2738146040Stjr 2739146040Stjr /* Build the table for single byte characters. */ 2740146040Stjr for (wc = 0; wc < SBC_MAX; ++wc) 2741146040Stjr { 2742146040Stjr cmp_buf[2] = wc; 2743146040Stjr if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2744146040Stjr && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2745146040Stjr bitset_set (sbcset, wc); 2746146040Stjr } 2747146040Stjr } 2748146040Stjr# else /* not RE_ENABLE_I18N */ 2749146040Stjr { 2750146040Stjr unsigned int ch; 2751146040Stjr start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2752146040Stjr : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2753146040Stjr : 0)); 2754146040Stjr end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2755146040Stjr : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2756146040Stjr : 0)); 2757146040Stjr if (start_ch > end_ch) 2758146040Stjr return REG_ERANGE; 2759146040Stjr /* Build the table for single byte characters. */ 2760146040Stjr for (ch = 0; ch < SBC_MAX; ++ch) 2761146040Stjr if (start_ch <= ch && ch <= end_ch) 2762146040Stjr bitset_set (sbcset, ch); 2763146040Stjr } 2764146040Stjr# endif /* not RE_ENABLE_I18N */ 2765146040Stjr return REG_NOERROR; 2766146040Stjr} 2767146040Stjr#endif /* not _LIBC */ 2768146040Stjr 2769146040Stjr#ifndef _LIBC 2770146040Stjr/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2771146040Stjr Build the collating element which is represented by NAME. 2772146040Stjr The result are written to MBCSET and SBCSET. 2773146040Stjr COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2774146040Stjr pointer argument since we may update it. */ 2775146040Stjr 2776146040Stjrstatic reg_errcode_t 2777146040Stjr# ifdef RE_ENABLE_I18N 2778146040Stjrbuild_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2779146040Stjr re_charset_t *mbcset; 2780146040Stjr int *coll_sym_alloc; 2781146040Stjr# else /* not RE_ENABLE_I18N */ 2782146040Stjrbuild_collating_symbol (sbcset, name) 2783146040Stjr# endif /* not RE_ENABLE_I18N */ 2784146040Stjr re_bitset_ptr_t sbcset; 2785146040Stjr const unsigned char *name; 2786146040Stjr{ 2787146040Stjr size_t name_len = strlen ((const char *) name); 2788146040Stjr if (BE (name_len != 1, 0)) 2789146040Stjr return REG_ECOLLATE; 2790146040Stjr else 2791146040Stjr { 2792146040Stjr bitset_set (sbcset, name[0]); 2793146040Stjr return REG_NOERROR; 2794146040Stjr } 2795146040Stjr} 2796146040Stjr#endif /* not _LIBC */ 2797146040Stjr 2798146040Stjr/* This function parse bracket expression like "[abc]", "[a-c]", 2799146040Stjr "[[.a-a.]]" etc. */ 2800146040Stjr 2801146040Stjrstatic bin_tree_t * 2802146040Stjrparse_bracket_exp (regexp, dfa, token, syntax, err) 2803146040Stjr re_string_t *regexp; 2804146040Stjr re_dfa_t *dfa; 2805146040Stjr re_token_t *token; 2806146040Stjr reg_syntax_t syntax; 2807146040Stjr reg_errcode_t *err; 2808146040Stjr{ 2809146040Stjr#ifdef _LIBC 2810146040Stjr const unsigned char *collseqmb; 2811146040Stjr const char *collseqwc; 2812146040Stjr uint32_t nrules; 2813146040Stjr int32_t table_size; 2814146040Stjr const int32_t *symb_table; 2815146040Stjr const unsigned char *extra; 2816146040Stjr 2817146040Stjr /* Local function for parse_bracket_exp used in _LIBC environement. 2818146040Stjr Seek the collating symbol entry correspondings to NAME. 2819146040Stjr Return the index of the symbol in the SYMB_TABLE. */ 2820146040Stjr 2821146040Stjr auto inline int32_t 2822146040Stjr __attribute ((always_inline)) 2823146040Stjr seek_collating_symbol_entry (name, name_len) 2824146040Stjr const unsigned char *name; 2825146040Stjr size_t name_len; 2826146040Stjr { 2827146040Stjr int32_t hash = elem_hash ((const char *) name, name_len); 2828146040Stjr int32_t elem = hash % table_size; 2829146040Stjr int32_t second = hash % (table_size - 2); 2830146040Stjr while (symb_table[2 * elem] != 0) 2831146040Stjr { 2832146040Stjr /* First compare the hashing value. */ 2833146040Stjr if (symb_table[2 * elem] == hash 2834146040Stjr /* Compare the length of the name. */ 2835146040Stjr && name_len == extra[symb_table[2 * elem + 1]] 2836146040Stjr /* Compare the name. */ 2837146040Stjr && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2838146040Stjr name_len) == 0) 2839146040Stjr { 2840146040Stjr /* Yep, this is the entry. */ 2841146040Stjr break; 2842146040Stjr } 2843146040Stjr 2844146040Stjr /* Next entry. */ 2845146040Stjr elem += second; 2846146040Stjr } 2847146040Stjr return elem; 2848146040Stjr } 2849146040Stjr 2850146040Stjr /* Local function for parse_bracket_exp used in _LIBC environement. 2851146040Stjr Look up the collation sequence value of BR_ELEM. 2852146040Stjr Return the value if succeeded, UINT_MAX otherwise. */ 2853146040Stjr 2854146040Stjr auto inline unsigned int 2855146040Stjr __attribute ((always_inline)) 2856146040Stjr lookup_collation_sequence_value (br_elem) 2857146040Stjr bracket_elem_t *br_elem; 2858146040Stjr { 2859146040Stjr if (br_elem->type == SB_CHAR) 2860146040Stjr { 2861146040Stjr /* 2862146040Stjr if (MB_CUR_MAX == 1) 2863146040Stjr */ 2864146040Stjr if (nrules == 0) 2865146040Stjr return collseqmb[br_elem->opr.ch]; 2866146040Stjr else 2867146040Stjr { 2868146040Stjr wint_t wc = __btowc (br_elem->opr.ch); 2869146040Stjr return __collseq_table_lookup (collseqwc, wc); 2870146040Stjr } 2871146040Stjr } 2872146040Stjr else if (br_elem->type == MB_CHAR) 2873146040Stjr { 2874146040Stjr return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2875146040Stjr } 2876146040Stjr else if (br_elem->type == COLL_SYM) 2877146040Stjr { 2878146040Stjr size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2879146040Stjr if (nrules != 0) 2880146040Stjr { 2881146040Stjr int32_t elem, idx; 2882146040Stjr elem = seek_collating_symbol_entry (br_elem->opr.name, 2883146040Stjr sym_name_len); 2884146040Stjr if (symb_table[2 * elem] != 0) 2885146040Stjr { 2886146040Stjr /* We found the entry. */ 2887146040Stjr idx = symb_table[2 * elem + 1]; 2888146040Stjr /* Skip the name of collating element name. */ 2889146040Stjr idx += 1 + extra[idx]; 2890146040Stjr /* Skip the byte sequence of the collating element. */ 2891146040Stjr idx += 1 + extra[idx]; 2892146040Stjr /* Adjust for the alignment. */ 2893146040Stjr idx = (idx + 3) & ~3; 2894146040Stjr /* Skip the multibyte collation sequence value. */ 2895146040Stjr idx += sizeof (unsigned int); 2896146040Stjr /* Skip the wide char sequence of the collating element. */ 2897146040Stjr idx += sizeof (unsigned int) * 2898146040Stjr (1 + *(unsigned int *) (extra + idx)); 2899146040Stjr /* Return the collation sequence value. */ 2900146040Stjr return *(unsigned int *) (extra + idx); 2901146040Stjr } 2902146040Stjr else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2903146040Stjr { 2904146040Stjr /* No valid character. Match it as a single byte 2905146040Stjr character. */ 2906146040Stjr return collseqmb[br_elem->opr.name[0]]; 2907146040Stjr } 2908146040Stjr } 2909146040Stjr else if (sym_name_len == 1) 2910146040Stjr return collseqmb[br_elem->opr.name[0]]; 2911146040Stjr } 2912146040Stjr return UINT_MAX; 2913146040Stjr } 2914146040Stjr 2915146040Stjr /* Local function for parse_bracket_exp used in _LIBC environement. 2916146040Stjr Build the range expression which starts from START_ELEM, and ends 2917146040Stjr at END_ELEM. The result are written to MBCSET and SBCSET. 2918146040Stjr RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2919146040Stjr mbcset->range_ends, is a pointer argument sinse we may 2920146040Stjr update it. */ 2921146040Stjr 2922146040Stjr auto inline reg_errcode_t 2923146040Stjr __attribute ((always_inline)) 2924146040Stjr build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2925146040Stjr re_charset_t *mbcset; 2926146040Stjr int *range_alloc; 2927146040Stjr re_bitset_ptr_t sbcset; 2928146040Stjr bracket_elem_t *start_elem, *end_elem; 2929146040Stjr { 2930146040Stjr unsigned int ch; 2931146040Stjr uint32_t start_collseq; 2932146040Stjr uint32_t end_collseq; 2933146040Stjr 2934146040Stjr /* Equivalence Classes and Character Classes can't be a range 2935146040Stjr start/end. */ 2936146040Stjr if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2937146040Stjr || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2938146040Stjr 0)) 2939146040Stjr return REG_ERANGE; 2940146040Stjr 2941146040Stjr start_collseq = lookup_collation_sequence_value (start_elem); 2942146040Stjr end_collseq = lookup_collation_sequence_value (end_elem); 2943146040Stjr /* Check start/end collation sequence values. */ 2944146040Stjr if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2945146040Stjr return REG_ECOLLATE; 2946146040Stjr if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2947146040Stjr return REG_ERANGE; 2948146040Stjr 2949146040Stjr /* Got valid collation sequence values, add them as a new entry. 2950146040Stjr However, if we have no collation elements, and the character set 2951146040Stjr is single byte, the single byte character set that we 2952146040Stjr build below suffices. */ 2953146040Stjr if (nrules > 0 || dfa->mb_cur_max > 1) 2954146040Stjr { 2955146040Stjr /* Check the space of the arrays. */ 2956146040Stjr if (BE (*range_alloc == mbcset->nranges, 0)) 2957146040Stjr { 2958146040Stjr /* There is not enough space, need realloc. */ 2959146040Stjr uint32_t *new_array_start; 2960146040Stjr uint32_t *new_array_end; 2961146040Stjr int new_nranges; 2962146040Stjr 2963146040Stjr /* +1 in case of mbcset->nranges is 0. */ 2964146040Stjr new_nranges = 2 * mbcset->nranges + 1; 2965146040Stjr new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2966146040Stjr new_nranges); 2967146040Stjr new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2968146040Stjr new_nranges); 2969146040Stjr 2970146040Stjr if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2971146040Stjr return REG_ESPACE; 2972146040Stjr 2973146040Stjr mbcset->range_starts = new_array_start; 2974146040Stjr mbcset->range_ends = new_array_end; 2975146040Stjr *range_alloc = new_nranges; 2976146040Stjr } 2977146040Stjr 2978146040Stjr mbcset->range_starts[mbcset->nranges] = start_collseq; 2979146040Stjr mbcset->range_ends[mbcset->nranges++] = end_collseq; 2980146040Stjr } 2981146040Stjr 2982146040Stjr /* Build the table for single byte characters. */ 2983146040Stjr for (ch = 0; ch < SBC_MAX; ch++) 2984146040Stjr { 2985146040Stjr uint32_t ch_collseq; 2986146040Stjr /* 2987146040Stjr if (MB_CUR_MAX == 1) 2988146040Stjr */ 2989146040Stjr if (nrules == 0) 2990146040Stjr ch_collseq = collseqmb[ch]; 2991146040Stjr else 2992146040Stjr ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2993146040Stjr if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2994146040Stjr bitset_set (sbcset, ch); 2995146040Stjr } 2996146040Stjr return REG_NOERROR; 2997146040Stjr } 2998146040Stjr 2999146040Stjr /* Local function for parse_bracket_exp used in _LIBC environement. 3000146040Stjr Build the collating element which is represented by NAME. 3001146040Stjr The result are written to MBCSET and SBCSET. 3002146040Stjr COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 3003146040Stjr pointer argument sinse we may update it. */ 3004146040Stjr 3005146040Stjr auto inline reg_errcode_t 3006146040Stjr __attribute ((always_inline)) 3007146040Stjr build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 3008146040Stjr re_charset_t *mbcset; 3009146040Stjr int *coll_sym_alloc; 3010146040Stjr re_bitset_ptr_t sbcset; 3011146040Stjr const unsigned char *name; 3012146040Stjr { 3013146040Stjr int32_t elem, idx; 3014146040Stjr size_t name_len = strlen ((const char *) name); 3015146040Stjr if (nrules != 0) 3016146040Stjr { 3017146040Stjr elem = seek_collating_symbol_entry (name, name_len); 3018146040Stjr if (symb_table[2 * elem] != 0) 3019146040Stjr { 3020146040Stjr /* We found the entry. */ 3021146040Stjr idx = symb_table[2 * elem + 1]; 3022146040Stjr /* Skip the name of collating element name. */ 3023146040Stjr idx += 1 + extra[idx]; 3024146040Stjr } 3025146040Stjr else if (symb_table[2 * elem] == 0 && name_len == 1) 3026146040Stjr { 3027146040Stjr /* No valid character, treat it as a normal 3028146040Stjr character. */ 3029146040Stjr bitset_set (sbcset, name[0]); 3030146040Stjr return REG_NOERROR; 3031146040Stjr } 3032146040Stjr else 3033146040Stjr return REG_ECOLLATE; 3034146040Stjr 3035146040Stjr /* Got valid collation sequence, add it as a new entry. */ 3036146040Stjr /* Check the space of the arrays. */ 3037146040Stjr if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3038146040Stjr { 3039146040Stjr /* Not enough, realloc it. */ 3040146040Stjr /* +1 in case of mbcset->ncoll_syms is 0. */ 3041146040Stjr int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3042146040Stjr /* Use realloc since mbcset->coll_syms is NULL 3043146040Stjr if *alloc == 0. */ 3044146040Stjr int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3045146040Stjr new_coll_sym_alloc); 3046146040Stjr if (BE (new_coll_syms == NULL, 0)) 3047146040Stjr return REG_ESPACE; 3048146040Stjr mbcset->coll_syms = new_coll_syms; 3049146040Stjr *coll_sym_alloc = new_coll_sym_alloc; 3050146040Stjr } 3051146040Stjr mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3052146040Stjr return REG_NOERROR; 3053146040Stjr } 3054146040Stjr else 3055146040Stjr { 3056146040Stjr if (BE (name_len != 1, 0)) 3057146040Stjr return REG_ECOLLATE; 3058146040Stjr else 3059146040Stjr { 3060146040Stjr bitset_set (sbcset, name[0]); 3061146040Stjr return REG_NOERROR; 3062146040Stjr } 3063146040Stjr } 3064146040Stjr } 3065146040Stjr#endif 3066146040Stjr 3067146040Stjr re_token_t br_token; 3068146040Stjr re_bitset_ptr_t sbcset; 3069146040Stjr#ifdef RE_ENABLE_I18N 3070146040Stjr re_charset_t *mbcset; 3071146040Stjr int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3072146040Stjr int equiv_class_alloc = 0, char_class_alloc = 0; 3073146040Stjr#endif /* not RE_ENABLE_I18N */ 3074146040Stjr int non_match = 0; 3075146040Stjr bin_tree_t *work_tree; 3076146040Stjr int token_len; 3077146040Stjr int first_round = 1; 3078146040Stjr#ifdef _LIBC 3079146040Stjr collseqmb = (const unsigned char *) 3080146040Stjr _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3081146040Stjr nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3082146040Stjr if (nrules) 3083146040Stjr { 3084146040Stjr /* 3085146040Stjr if (MB_CUR_MAX > 1) 3086146040Stjr */ 3087146040Stjr collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3088146040Stjr table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3089146040Stjr symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3090146040Stjr _NL_COLLATE_SYMB_TABLEMB); 3091146040Stjr extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3092146040Stjr _NL_COLLATE_SYMB_EXTRAMB); 3093146040Stjr } 3094146040Stjr#endif 3095146040Stjr sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); 3096146040Stjr#ifdef RE_ENABLE_I18N 3097146040Stjr mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3098146040Stjr#endif /* RE_ENABLE_I18N */ 3099146040Stjr#ifdef RE_ENABLE_I18N 3100146040Stjr if (BE (sbcset == NULL || mbcset == NULL, 0)) 3101146040Stjr#else 3102146040Stjr if (BE (sbcset == NULL, 0)) 3103146040Stjr#endif /* RE_ENABLE_I18N */ 3104146040Stjr { 3105146040Stjr *err = REG_ESPACE; 3106146040Stjr return NULL; 3107146040Stjr } 3108146040Stjr 3109146040Stjr token_len = peek_token_bracket (token, regexp, syntax); 3110146040Stjr if (BE (token->type == END_OF_RE, 0)) 3111146040Stjr { 3112146040Stjr *err = REG_BADPAT; 3113146040Stjr goto parse_bracket_exp_free_return; 3114146040Stjr } 3115146040Stjr if (token->type == OP_NON_MATCH_LIST) 3116146040Stjr { 3117146040Stjr#ifdef RE_ENABLE_I18N 3118146040Stjr mbcset->non_match = 1; 3119146040Stjr#endif /* not RE_ENABLE_I18N */ 3120146040Stjr non_match = 1; 3121146040Stjr if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3122146040Stjr bitset_set (sbcset, '\0'); 3123146040Stjr re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3124146040Stjr token_len = peek_token_bracket (token, regexp, syntax); 3125146040Stjr if (BE (token->type == END_OF_RE, 0)) 3126146040Stjr { 3127146040Stjr *err = REG_BADPAT; 3128146040Stjr goto parse_bracket_exp_free_return; 3129146040Stjr } 3130146040Stjr } 3131146040Stjr 3132146040Stjr /* We treat the first ']' as a normal character. */ 3133146040Stjr if (token->type == OP_CLOSE_BRACKET) 3134146040Stjr token->type = CHARACTER; 3135146040Stjr 3136146040Stjr while (1) 3137146040Stjr { 3138146040Stjr bracket_elem_t start_elem, end_elem; 3139146040Stjr unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3140146040Stjr unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3141146040Stjr reg_errcode_t ret; 3142146040Stjr int token_len2 = 0, is_range_exp = 0; 3143146040Stjr re_token_t token2; 3144146040Stjr 3145146040Stjr start_elem.opr.name = start_name_buf; 3146146040Stjr ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3147146040Stjr syntax, first_round); 3148146040Stjr if (BE (ret != REG_NOERROR, 0)) 3149146040Stjr { 3150146040Stjr *err = ret; 3151146040Stjr goto parse_bracket_exp_free_return; 3152146040Stjr } 3153146040Stjr first_round = 0; 3154146040Stjr 3155146040Stjr /* Get information about the next token. We need it in any case. */ 3156146040Stjr token_len = peek_token_bracket (token, regexp, syntax); 3157146040Stjr 3158146040Stjr /* Do not check for ranges if we know they are not allowed. */ 3159146040Stjr if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3160146040Stjr { 3161146040Stjr if (BE (token->type == END_OF_RE, 0)) 3162146040Stjr { 3163146040Stjr *err = REG_EBRACK; 3164146040Stjr goto parse_bracket_exp_free_return; 3165146040Stjr } 3166146040Stjr if (token->type == OP_CHARSET_RANGE) 3167146040Stjr { 3168146040Stjr re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3169146040Stjr token_len2 = peek_token_bracket (&token2, regexp, syntax); 3170146040Stjr if (BE (token2.type == END_OF_RE, 0)) 3171146040Stjr { 3172146040Stjr *err = REG_EBRACK; 3173146040Stjr goto parse_bracket_exp_free_return; 3174146040Stjr } 3175146040Stjr if (token2.type == OP_CLOSE_BRACKET) 3176146040Stjr { 3177146040Stjr /* We treat the last '-' as a normal character. */ 3178146040Stjr re_string_skip_bytes (regexp, -token_len); 3179146040Stjr token->type = CHARACTER; 3180146040Stjr } 3181146040Stjr else 3182146040Stjr is_range_exp = 1; 3183146040Stjr } 3184146040Stjr } 3185146040Stjr 3186146040Stjr if (is_range_exp == 1) 3187146040Stjr { 3188146040Stjr end_elem.opr.name = end_name_buf; 3189146040Stjr ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3190146040Stjr dfa, syntax, 1); 3191146040Stjr if (BE (ret != REG_NOERROR, 0)) 3192146040Stjr { 3193146040Stjr *err = ret; 3194146040Stjr goto parse_bracket_exp_free_return; 3195146040Stjr } 3196146040Stjr 3197146040Stjr token_len = peek_token_bracket (token, regexp, syntax); 3198146040Stjr 3199146040Stjr#ifdef _LIBC 3200146040Stjr *err = build_range_exp (sbcset, mbcset, &range_alloc, 3201146040Stjr &start_elem, &end_elem); 3202146040Stjr#else 3203146040Stjr# ifdef RE_ENABLE_I18N 3204146040Stjr *err = build_range_exp (sbcset, 3205146040Stjr dfa->mb_cur_max > 1 ? mbcset : NULL, 3206146040Stjr &range_alloc, &start_elem, &end_elem); 3207146040Stjr# else 3208146040Stjr *err = build_range_exp (sbcset, &start_elem, &end_elem); 3209146040Stjr# endif 3210146040Stjr#endif /* RE_ENABLE_I18N */ 3211146040Stjr if (BE (*err != REG_NOERROR, 0)) 3212146040Stjr goto parse_bracket_exp_free_return; 3213146040Stjr } 3214146040Stjr else 3215146040Stjr { 3216146040Stjr switch (start_elem.type) 3217146040Stjr { 3218146040Stjr case SB_CHAR: 3219146040Stjr bitset_set (sbcset, start_elem.opr.ch); 3220146040Stjr break; 3221146040Stjr#ifdef RE_ENABLE_I18N 3222146040Stjr case MB_CHAR: 3223146040Stjr /* Check whether the array has enough space. */ 3224146040Stjr if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3225146040Stjr { 3226146040Stjr wchar_t *new_mbchars; 3227146040Stjr /* Not enough, realloc it. */ 3228146040Stjr /* +1 in case of mbcset->nmbchars is 0. */ 3229146040Stjr mbchar_alloc = 2 * mbcset->nmbchars + 1; 3230146040Stjr /* Use realloc since array is NULL if *alloc == 0. */ 3231146040Stjr new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3232146040Stjr mbchar_alloc); 3233146040Stjr if (BE (new_mbchars == NULL, 0)) 3234146040Stjr goto parse_bracket_exp_espace; 3235146040Stjr mbcset->mbchars = new_mbchars; 3236146040Stjr } 3237146040Stjr mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3238146040Stjr break; 3239146040Stjr#endif /* RE_ENABLE_I18N */ 3240146040Stjr case EQUIV_CLASS: 3241146040Stjr *err = build_equiv_class (sbcset, 3242146040Stjr#ifdef RE_ENABLE_I18N 3243146040Stjr mbcset, &equiv_class_alloc, 3244146040Stjr#endif /* RE_ENABLE_I18N */ 3245146040Stjr start_elem.opr.name); 3246146040Stjr if (BE (*err != REG_NOERROR, 0)) 3247146040Stjr goto parse_bracket_exp_free_return; 3248146040Stjr break; 3249146040Stjr case COLL_SYM: 3250146040Stjr *err = build_collating_symbol (sbcset, 3251146040Stjr#ifdef RE_ENABLE_I18N 3252146040Stjr mbcset, &coll_sym_alloc, 3253146040Stjr#endif /* RE_ENABLE_I18N */ 3254146040Stjr start_elem.opr.name); 3255146040Stjr if (BE (*err != REG_NOERROR, 0)) 3256146040Stjr goto parse_bracket_exp_free_return; 3257146040Stjr break; 3258146040Stjr case CHAR_CLASS: 3259146040Stjr *err = build_charclass (regexp->trans, sbcset, 3260146040Stjr#ifdef RE_ENABLE_I18N 3261146040Stjr mbcset, &char_class_alloc, 3262146040Stjr#endif /* RE_ENABLE_I18N */ 3263146040Stjr start_elem.opr.name, syntax); 3264146040Stjr if (BE (*err != REG_NOERROR, 0)) 3265146040Stjr goto parse_bracket_exp_free_return; 3266146040Stjr break; 3267146040Stjr default: 3268146040Stjr assert (0); 3269146040Stjr break; 3270146040Stjr } 3271146040Stjr } 3272146040Stjr if (BE (token->type == END_OF_RE, 0)) 3273146040Stjr { 3274146040Stjr *err = REG_EBRACK; 3275146040Stjr goto parse_bracket_exp_free_return; 3276146040Stjr } 3277146040Stjr if (token->type == OP_CLOSE_BRACKET) 3278146040Stjr break; 3279146040Stjr } 3280146040Stjr 3281146040Stjr re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3282146040Stjr 3283146040Stjr /* If it is non-matching list. */ 3284146040Stjr if (non_match) 3285146040Stjr bitset_not (sbcset); 3286146040Stjr 3287146040Stjr#ifdef RE_ENABLE_I18N 3288146040Stjr /* Ensure only single byte characters are set. */ 3289146040Stjr if (dfa->mb_cur_max > 1) 3290146040Stjr bitset_mask (sbcset, dfa->sb_char); 3291146040Stjr 3292146040Stjr if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3293146040Stjr || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3294146040Stjr || mbcset->non_match))) 3295146040Stjr { 3296146040Stjr bin_tree_t *mbc_tree; 3297146040Stjr int sbc_idx; 3298146040Stjr /* Build a tree for complex bracket. */ 3299146040Stjr dfa->has_mb_node = 1; 3300146040Stjr br_token.type = COMPLEX_BRACKET; 3301146040Stjr br_token.opr.mbcset = mbcset; 3302146040Stjr mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3303146040Stjr if (BE (mbc_tree == NULL, 0)) 3304146040Stjr goto parse_bracket_exp_espace; 3305146040Stjr for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx) 3306146040Stjr if (sbcset[sbc_idx]) 3307146040Stjr break; 3308146040Stjr /* If there are no bits set in sbcset, there is no point 3309146040Stjr of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3310146040Stjr if (sbc_idx < BITSET_UINTS) 3311146040Stjr { 3312146040Stjr /* Build a tree for simple bracket. */ 3313146040Stjr br_token.type = SIMPLE_BRACKET; 3314146040Stjr br_token.opr.sbcset = sbcset; 3315146040Stjr work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3316146040Stjr if (BE (work_tree == NULL, 0)) 3317146040Stjr goto parse_bracket_exp_espace; 3318146040Stjr 3319146040Stjr /* Then join them by ALT node. */ 3320146040Stjr work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3321146040Stjr if (BE (work_tree == NULL, 0)) 3322146040Stjr goto parse_bracket_exp_espace; 3323146040Stjr } 3324146040Stjr else 3325146040Stjr { 3326146040Stjr re_free (sbcset); 3327146040Stjr work_tree = mbc_tree; 3328146040Stjr } 3329146040Stjr } 3330146040Stjr else 3331146040Stjr#endif /* not RE_ENABLE_I18N */ 3332146040Stjr { 3333146040Stjr#ifdef RE_ENABLE_I18N 3334146040Stjr free_charset (mbcset); 3335146040Stjr#endif 3336146040Stjr /* Build a tree for simple bracket. */ 3337146040Stjr br_token.type = SIMPLE_BRACKET; 3338146040Stjr br_token.opr.sbcset = sbcset; 3339146040Stjr work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3340146040Stjr if (BE (work_tree == NULL, 0)) 3341146040Stjr goto parse_bracket_exp_espace; 3342146040Stjr } 3343146040Stjr return work_tree; 3344146040Stjr 3345146040Stjr parse_bracket_exp_espace: 3346146040Stjr *err = REG_ESPACE; 3347146040Stjr parse_bracket_exp_free_return: 3348146040Stjr re_free (sbcset); 3349146040Stjr#ifdef RE_ENABLE_I18N 3350146040Stjr free_charset (mbcset); 3351146040Stjr#endif /* RE_ENABLE_I18N */ 3352146040Stjr return NULL; 3353146040Stjr} 3354146040Stjr 3355146040Stjr/* Parse an element in the bracket expression. */ 3356146040Stjr 3357146040Stjrstatic reg_errcode_t 3358146040Stjrparse_bracket_element (elem, regexp, token, token_len, dfa, syntax, 3359146040Stjr accept_hyphen) 3360146040Stjr bracket_elem_t *elem; 3361146040Stjr re_string_t *regexp; 3362146040Stjr re_token_t *token; 3363146040Stjr int token_len; 3364146040Stjr re_dfa_t *dfa; 3365146040Stjr reg_syntax_t syntax; 3366146040Stjr int accept_hyphen; 3367146040Stjr{ 3368146040Stjr#ifdef RE_ENABLE_I18N 3369146040Stjr int cur_char_size; 3370146040Stjr cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3371146040Stjr if (cur_char_size > 1) 3372146040Stjr { 3373146040Stjr elem->type = MB_CHAR; 3374146040Stjr elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3375146040Stjr re_string_skip_bytes (regexp, cur_char_size); 3376146040Stjr return REG_NOERROR; 3377146040Stjr } 3378146040Stjr#endif /* RE_ENABLE_I18N */ 3379146040Stjr re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3380146040Stjr if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3381146040Stjr || token->type == OP_OPEN_EQUIV_CLASS) 3382146040Stjr return parse_bracket_symbol (elem, regexp, token); 3383146040Stjr if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3384146040Stjr { 3385146040Stjr /* A '-' must only appear as anything but a range indicator before 3386146040Stjr the closing bracket. Everything else is an error. */ 3387146040Stjr re_token_t token2; 3388146040Stjr (void) peek_token_bracket (&token2, regexp, syntax); 3389146040Stjr if (token2.type != OP_CLOSE_BRACKET) 3390146040Stjr /* The actual error value is not standardized since this whole 3391146040Stjr case is undefined. But ERANGE makes good sense. */ 3392146040Stjr return REG_ERANGE; 3393146040Stjr } 3394146040Stjr elem->type = SB_CHAR; 3395146040Stjr elem->opr.ch = token->opr.c; 3396146040Stjr return REG_NOERROR; 3397146040Stjr} 3398146040Stjr 3399146040Stjr/* Parse a bracket symbol in the bracket expression. Bracket symbols are 3400146040Stjr such as [:<character_class>:], [.<collating_element>.], and 3401146040Stjr [=<equivalent_class>=]. */ 3402146040Stjr 3403146040Stjrstatic reg_errcode_t 3404146040Stjrparse_bracket_symbol (elem, regexp, token) 3405146040Stjr bracket_elem_t *elem; 3406146040Stjr re_string_t *regexp; 3407146040Stjr re_token_t *token; 3408146040Stjr{ 3409146040Stjr unsigned char ch, delim = token->opr.c; 3410146040Stjr int i = 0; 3411146040Stjr if (re_string_eoi(regexp)) 3412146040Stjr return REG_EBRACK; 3413146040Stjr for (;; ++i) 3414146040Stjr { 3415146040Stjr if (i >= BRACKET_NAME_BUF_SIZE) 3416146040Stjr return REG_EBRACK; 3417146040Stjr if (token->type == OP_OPEN_CHAR_CLASS) 3418146040Stjr ch = re_string_fetch_byte_case (regexp); 3419146040Stjr else 3420146040Stjr ch = re_string_fetch_byte (regexp); 3421146040Stjr if (re_string_eoi(regexp)) 3422146040Stjr return REG_EBRACK; 3423146040Stjr if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3424146040Stjr break; 3425146040Stjr elem->opr.name[i] = ch; 3426146040Stjr } 3427146040Stjr re_string_skip_bytes (regexp, 1); 3428146040Stjr elem->opr.name[i] = '\0'; 3429146040Stjr switch (token->type) 3430146040Stjr { 3431146040Stjr case OP_OPEN_COLL_ELEM: 3432146040Stjr elem->type = COLL_SYM; 3433146040Stjr break; 3434146040Stjr case OP_OPEN_EQUIV_CLASS: 3435146040Stjr elem->type = EQUIV_CLASS; 3436146040Stjr break; 3437146040Stjr case OP_OPEN_CHAR_CLASS: 3438146040Stjr elem->type = CHAR_CLASS; 3439146040Stjr break; 3440146040Stjr default: 3441146040Stjr break; 3442146040Stjr } 3443146040Stjr return REG_NOERROR; 3444146040Stjr} 3445146040Stjr 3446146040Stjr /* Helper function for parse_bracket_exp. 3447146040Stjr Build the equivalence class which is represented by NAME. 3448146040Stjr The result are written to MBCSET and SBCSET. 3449146040Stjr EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3450146040Stjr is a pointer argument sinse we may update it. */ 3451146040Stjr 3452146040Stjrstatic reg_errcode_t 3453146040Stjr#ifdef RE_ENABLE_I18N 3454146040Stjrbuild_equiv_class (sbcset, mbcset, equiv_class_alloc, name) 3455146040Stjr re_charset_t *mbcset; 3456146040Stjr int *equiv_class_alloc; 3457146040Stjr#else /* not RE_ENABLE_I18N */ 3458146040Stjrbuild_equiv_class (sbcset, name) 3459146040Stjr#endif /* not RE_ENABLE_I18N */ 3460146040Stjr re_bitset_ptr_t sbcset; 3461146040Stjr const unsigned char *name; 3462146040Stjr{ 3463146040Stjr#if defined _LIBC 3464146040Stjr uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3465146040Stjr if (nrules != 0) 3466146040Stjr { 3467146040Stjr const int32_t *table, *indirect; 3468146040Stjr const unsigned char *weights, *extra, *cp; 3469146040Stjr unsigned char char_buf[2]; 3470146040Stjr int32_t idx1, idx2; 3471146040Stjr unsigned int ch; 3472146040Stjr size_t len; 3473146040Stjr /* This #include defines a local function! */ 3474146040Stjr# include <locale/weight.h> 3475146040Stjr /* Calculate the index for equivalence class. */ 3476146040Stjr cp = name; 3477146040Stjr table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3478146040Stjr weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3479146040Stjr _NL_COLLATE_WEIGHTMB); 3480146040Stjr extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3481146040Stjr _NL_COLLATE_EXTRAMB); 3482146040Stjr indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3483146040Stjr _NL_COLLATE_INDIRECTMB); 3484146040Stjr idx1 = findidx (&cp); 3485146040Stjr if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3486146040Stjr /* This isn't a valid character. */ 3487146040Stjr return REG_ECOLLATE; 3488146040Stjr 3489146040Stjr /* Build single byte matcing table for this equivalence class. */ 3490146040Stjr char_buf[1] = (unsigned char) '\0'; 3491146040Stjr len = weights[idx1]; 3492146040Stjr for (ch = 0; ch < SBC_MAX; ++ch) 3493146040Stjr { 3494146040Stjr char_buf[0] = ch; 3495146040Stjr cp = char_buf; 3496146040Stjr idx2 = findidx (&cp); 3497146040Stjr/* 3498146040Stjr idx2 = table[ch]; 3499146040Stjr*/ 3500146040Stjr if (idx2 == 0) 3501146040Stjr /* This isn't a valid character. */ 3502146040Stjr continue; 3503146040Stjr if (len == weights[idx2]) 3504146040Stjr { 3505146040Stjr int cnt = 0; 3506146040Stjr while (cnt <= len && 3507146040Stjr weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) 3508146040Stjr ++cnt; 3509146040Stjr 3510146040Stjr if (cnt > len) 3511146040Stjr bitset_set (sbcset, ch); 3512146040Stjr } 3513146040Stjr } 3514146040Stjr /* Check whether the array has enough space. */ 3515146040Stjr if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3516146040Stjr { 3517146040Stjr /* Not enough, realloc it. */ 3518146040Stjr /* +1 in case of mbcset->nequiv_classes is 0. */ 3519146040Stjr int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3520146040Stjr /* Use realloc since the array is NULL if *alloc == 0. */ 3521146040Stjr int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3522146040Stjr int32_t, 3523146040Stjr new_equiv_class_alloc); 3524146040Stjr if (BE (new_equiv_classes == NULL, 0)) 3525146040Stjr return REG_ESPACE; 3526146040Stjr mbcset->equiv_classes = new_equiv_classes; 3527146040Stjr *equiv_class_alloc = new_equiv_class_alloc; 3528146040Stjr } 3529146040Stjr mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3530146040Stjr } 3531146040Stjr else 3532146040Stjr#endif /* _LIBC */ 3533146040Stjr { 3534146040Stjr if (BE (strlen ((const char *) name) != 1, 0)) 3535146040Stjr return REG_ECOLLATE; 3536146040Stjr bitset_set (sbcset, *name); 3537146040Stjr } 3538146040Stjr return REG_NOERROR; 3539146040Stjr} 3540146040Stjr 3541146040Stjr /* Helper function for parse_bracket_exp. 3542146040Stjr Build the character class which is represented by NAME. 3543146040Stjr The result are written to MBCSET and SBCSET. 3544146040Stjr CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3545146040Stjr is a pointer argument sinse we may update it. */ 3546146040Stjr 3547146040Stjrstatic reg_errcode_t 3548146040Stjr#ifdef RE_ENABLE_I18N 3549146040Stjrbuild_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax) 3550146040Stjr re_charset_t *mbcset; 3551146040Stjr int *char_class_alloc; 3552146040Stjr#else /* not RE_ENABLE_I18N */ 3553146040Stjrbuild_charclass (trans, sbcset, class_name, syntax) 3554146040Stjr#endif /* not RE_ENABLE_I18N */ 3555146040Stjr unsigned RE_TRANSLATE_TYPE trans; 3556146040Stjr re_bitset_ptr_t sbcset; 3557146040Stjr const unsigned char *class_name; 3558146040Stjr reg_syntax_t syntax; 3559146040Stjr{ 3560146040Stjr int i; 3561146040Stjr const char *name = (const char *) class_name; 3562146040Stjr 3563146040Stjr /* In case of REG_ICASE "upper" and "lower" match the both of 3564146040Stjr upper and lower cases. */ 3565146040Stjr if ((syntax & RE_ICASE) 3566146040Stjr && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3567146040Stjr name = "alpha"; 3568146040Stjr 3569146040Stjr#ifdef RE_ENABLE_I18N 3570146040Stjr /* Check the space of the arrays. */ 3571146040Stjr if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3572146040Stjr { 3573146040Stjr /* Not enough, realloc it. */ 3574146040Stjr /* +1 in case of mbcset->nchar_classes is 0. */ 3575146040Stjr int new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3576146040Stjr /* Use realloc since array is NULL if *alloc == 0. */ 3577146040Stjr wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3578146040Stjr new_char_class_alloc); 3579146040Stjr if (BE (new_char_classes == NULL, 0)) 3580146040Stjr return REG_ESPACE; 3581146040Stjr mbcset->char_classes = new_char_classes; 3582146040Stjr *char_class_alloc = new_char_class_alloc; 3583146040Stjr } 3584146040Stjr mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3585146040Stjr#endif /* RE_ENABLE_I18N */ 3586146040Stjr 3587146040Stjr#define BUILD_CHARCLASS_LOOP(ctype_func) \ 3588146040Stjr for (i = 0; i < SBC_MAX; ++i) \ 3589146040Stjr { \ 3590146040Stjr if (ctype_func (i)) \ 3591146040Stjr { \ 3592146040Stjr int ch = trans ? trans[i] : i; \ 3593146040Stjr bitset_set (sbcset, ch); \ 3594146040Stjr } \ 3595146040Stjr } 3596146040Stjr 3597146040Stjr if (strcmp (name, "alnum") == 0) 3598146040Stjr BUILD_CHARCLASS_LOOP (isalnum) 3599146040Stjr else if (strcmp (name, "cntrl") == 0) 3600146040Stjr BUILD_CHARCLASS_LOOP (iscntrl) 3601146040Stjr else if (strcmp (name, "lower") == 0) 3602146040Stjr BUILD_CHARCLASS_LOOP (islower) 3603146040Stjr else if (strcmp (name, "space") == 0) 3604146040Stjr BUILD_CHARCLASS_LOOP (isspace) 3605146040Stjr else if (strcmp (name, "alpha") == 0) 3606146040Stjr BUILD_CHARCLASS_LOOP (isalpha) 3607146040Stjr else if (strcmp (name, "digit") == 0) 3608146040Stjr BUILD_CHARCLASS_LOOP (isdigit) 3609146040Stjr else if (strcmp (name, "print") == 0) 3610146040Stjr BUILD_CHARCLASS_LOOP (isprint) 3611146040Stjr else if (strcmp (name, "upper") == 0) 3612146040Stjr BUILD_CHARCLASS_LOOP (isupper) 3613146040Stjr else if (strcmp (name, "blank") == 0) 3614146040Stjr BUILD_CHARCLASS_LOOP (isblank) 3615146040Stjr else if (strcmp (name, "graph") == 0) 3616146040Stjr BUILD_CHARCLASS_LOOP (isgraph) 3617146040Stjr else if (strcmp (name, "punct") == 0) 3618146040Stjr BUILD_CHARCLASS_LOOP (ispunct) 3619146040Stjr else if (strcmp (name, "xdigit") == 0) 3620146040Stjr BUILD_CHARCLASS_LOOP (isxdigit) 3621146040Stjr else 3622146040Stjr return REG_ECTYPE; 3623146040Stjr 3624146040Stjr return REG_NOERROR; 3625146040Stjr} 3626146040Stjr 3627146040Stjrstatic bin_tree_t * 3628146040Stjrbuild_charclass_op (dfa, trans, class_name, extra, non_match, err) 3629146040Stjr re_dfa_t *dfa; 3630146040Stjr unsigned RE_TRANSLATE_TYPE trans; 3631146040Stjr const unsigned char *class_name; 3632146040Stjr const unsigned char *extra; 3633146040Stjr int non_match; 3634146040Stjr reg_errcode_t *err; 3635146040Stjr{ 3636146040Stjr re_bitset_ptr_t sbcset; 3637146040Stjr#ifdef RE_ENABLE_I18N 3638146040Stjr re_charset_t *mbcset; 3639146040Stjr int alloc = 0; 3640146040Stjr#endif /* not RE_ENABLE_I18N */ 3641146040Stjr reg_errcode_t ret; 3642146040Stjr re_token_t br_token; 3643146040Stjr bin_tree_t *tree; 3644146040Stjr 3645146040Stjr sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); 3646146040Stjr#ifdef RE_ENABLE_I18N 3647146040Stjr mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3648146040Stjr#endif /* RE_ENABLE_I18N */ 3649146040Stjr 3650146040Stjr#ifdef RE_ENABLE_I18N 3651146040Stjr if (BE (sbcset == NULL || mbcset == NULL, 0)) 3652146040Stjr#else /* not RE_ENABLE_I18N */ 3653146040Stjr if (BE (sbcset == NULL, 0)) 3654146040Stjr#endif /* not RE_ENABLE_I18N */ 3655146040Stjr { 3656146040Stjr *err = REG_ESPACE; 3657146040Stjr return NULL; 3658146040Stjr } 3659146040Stjr 3660146040Stjr if (non_match) 3661146040Stjr { 3662146040Stjr#ifdef RE_ENABLE_I18N 3663146040Stjr /* 3664146040Stjr if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3665146040Stjr bitset_set(cset->sbcset, '\0'); 3666146040Stjr */ 3667146040Stjr mbcset->non_match = 1; 3668146040Stjr#endif /* not RE_ENABLE_I18N */ 3669146040Stjr } 3670146040Stjr 3671146040Stjr /* We don't care the syntax in this case. */ 3672146040Stjr ret = build_charclass (trans, sbcset, 3673146040Stjr#ifdef RE_ENABLE_I18N 3674146040Stjr mbcset, &alloc, 3675146040Stjr#endif /* RE_ENABLE_I18N */ 3676146040Stjr class_name, 0); 3677146040Stjr 3678146040Stjr if (BE (ret != REG_NOERROR, 0)) 3679146040Stjr { 3680146040Stjr re_free (sbcset); 3681146040Stjr#ifdef RE_ENABLE_I18N 3682146040Stjr free_charset (mbcset); 3683146040Stjr#endif /* RE_ENABLE_I18N */ 3684146040Stjr *err = ret; 3685146040Stjr return NULL; 3686146040Stjr } 3687146040Stjr /* \w match '_' also. */ 3688146040Stjr for (; *extra; extra++) 3689146040Stjr bitset_set (sbcset, *extra); 3690146040Stjr 3691146040Stjr /* If it is non-matching list. */ 3692146040Stjr if (non_match) 3693146040Stjr bitset_not (sbcset); 3694146040Stjr 3695146040Stjr#ifdef RE_ENABLE_I18N 3696146040Stjr /* Ensure only single byte characters are set. */ 3697146040Stjr if (dfa->mb_cur_max > 1) 3698146040Stjr bitset_mask (sbcset, dfa->sb_char); 3699146040Stjr#endif 3700146040Stjr 3701146040Stjr /* Build a tree for simple bracket. */ 3702146040Stjr br_token.type = SIMPLE_BRACKET; 3703146040Stjr br_token.opr.sbcset = sbcset; 3704146040Stjr tree = create_token_tree (dfa, NULL, NULL, &br_token); 3705146040Stjr if (BE (tree == NULL, 0)) 3706146040Stjr goto build_word_op_espace; 3707146040Stjr 3708146040Stjr#ifdef RE_ENABLE_I18N 3709146040Stjr if (dfa->mb_cur_max > 1) 3710146040Stjr { 3711146040Stjr bin_tree_t *mbc_tree; 3712146040Stjr /* Build a tree for complex bracket. */ 3713146040Stjr br_token.type = COMPLEX_BRACKET; 3714146040Stjr br_token.opr.mbcset = mbcset; 3715146040Stjr dfa->has_mb_node = 1; 3716146040Stjr mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3717146040Stjr if (BE (mbc_tree == NULL, 0)) 3718146040Stjr goto build_word_op_espace; 3719146040Stjr /* Then join them by ALT node. */ 3720146040Stjr tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3721146040Stjr if (BE (mbc_tree != NULL, 1)) 3722146040Stjr return tree; 3723146040Stjr } 3724146040Stjr else 3725146040Stjr { 3726146040Stjr free_charset (mbcset); 3727146040Stjr return tree; 3728146040Stjr } 3729146040Stjr#else /* not RE_ENABLE_I18N */ 3730146040Stjr return tree; 3731146040Stjr#endif /* not RE_ENABLE_I18N */ 3732146040Stjr 3733146040Stjr build_word_op_espace: 3734146040Stjr re_free (sbcset); 3735146040Stjr#ifdef RE_ENABLE_I18N 3736146040Stjr free_charset (mbcset); 3737146040Stjr#endif /* RE_ENABLE_I18N */ 3738146040Stjr *err = REG_ESPACE; 3739146040Stjr return NULL; 3740146040Stjr} 3741146040Stjr 3742146040Stjr/* This is intended for the expressions like "a{1,3}". 3743146040Stjr Fetch a number from `input', and return the number. 3744146040Stjr Return -1, if the number field is empty like "{,1}". 3745146040Stjr Return -2, If an error is occured. */ 3746146040Stjr 3747146040Stjrstatic int 3748146040Stjrfetch_number (input, token, syntax) 3749146040Stjr re_string_t *input; 3750146040Stjr re_token_t *token; 3751146040Stjr reg_syntax_t syntax; 3752146040Stjr{ 3753146040Stjr int num = -1; 3754146040Stjr unsigned char c; 3755146040Stjr while (1) 3756146040Stjr { 3757146040Stjr fetch_token (token, input, syntax); 3758146040Stjr c = token->opr.c; 3759146040Stjr if (BE (token->type == END_OF_RE, 0)) 3760146040Stjr return -2; 3761146040Stjr if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3762146040Stjr break; 3763146040Stjr num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) 3764146040Stjr ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0')); 3765146040Stjr num = (num > RE_DUP_MAX) ? -2 : num; 3766146040Stjr } 3767146040Stjr return num; 3768146040Stjr} 3769146040Stjr 3770146040Stjr#ifdef RE_ENABLE_I18N 3771146040Stjrstatic void 3772146040Stjrfree_charset (re_charset_t *cset) 3773146040Stjr{ 3774146040Stjr re_free (cset->mbchars); 3775146040Stjr# ifdef _LIBC 3776146040Stjr re_free (cset->coll_syms); 3777146040Stjr re_free (cset->equiv_classes); 3778146040Stjr re_free (cset->range_starts); 3779146040Stjr re_free (cset->range_ends); 3780146040Stjr# endif 3781146040Stjr re_free (cset->char_classes); 3782146040Stjr re_free (cset); 3783146040Stjr} 3784146040Stjr#endif /* RE_ENABLE_I18N */ 3785146040Stjr 3786146040Stjr/* Functions for binary tree operation. */ 3787146040Stjr 3788146040Stjr/* Create a tree node. */ 3789146040Stjr 3790146040Stjrstatic bin_tree_t * 3791146040Stjrcreate_tree (dfa, left, right, type) 3792146040Stjr re_dfa_t *dfa; 3793146040Stjr bin_tree_t *left; 3794146040Stjr bin_tree_t *right; 3795146040Stjr re_token_type_t type; 3796146040Stjr{ 3797146040Stjr re_token_t t; 3798146040Stjr t.type = type; 3799146040Stjr return create_token_tree (dfa, left, right, &t); 3800146040Stjr} 3801146040Stjr 3802146040Stjrstatic bin_tree_t * 3803146040Stjrcreate_token_tree (dfa, left, right, token) 3804146040Stjr re_dfa_t *dfa; 3805146040Stjr bin_tree_t *left; 3806146040Stjr bin_tree_t *right; 3807146040Stjr const re_token_t *token; 3808146040Stjr{ 3809146040Stjr bin_tree_t *tree; 3810146040Stjr if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3811146040Stjr { 3812146040Stjr bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3813146040Stjr 3814146040Stjr if (storage == NULL) 3815146040Stjr return NULL; 3816146040Stjr storage->next = dfa->str_tree_storage; 3817146040Stjr dfa->str_tree_storage = storage; 3818146040Stjr dfa->str_tree_storage_idx = 0; 3819146040Stjr } 3820146040Stjr tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3821146040Stjr 3822146040Stjr tree->parent = NULL; 3823146040Stjr tree->left = left; 3824146040Stjr tree->right = right; 3825146040Stjr tree->token = *token; 3826146040Stjr tree->token.duplicated = 0; 3827146040Stjr tree->token.opt_subexp = 0; 3828146040Stjr tree->first = NULL; 3829146040Stjr tree->next = NULL; 3830146040Stjr tree->node_idx = -1; 3831146040Stjr 3832146040Stjr if (left != NULL) 3833146040Stjr left->parent = tree; 3834146040Stjr if (right != NULL) 3835146040Stjr right->parent = tree; 3836146040Stjr return tree; 3837146040Stjr} 3838146040Stjr 3839146040Stjr/* Mark the tree SRC as an optional subexpression. 3840146040Stjr To be called from preorder or postorder. */ 3841146040Stjr 3842146040Stjrstatic reg_errcode_t 3843146040Stjrmark_opt_subexp (extra, node) 3844146040Stjr void *extra; 3845146040Stjr bin_tree_t *node; 3846146040Stjr{ 3847146040Stjr int idx = (int) (long) extra; 3848146040Stjr if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3849146040Stjr node->token.opt_subexp = 1; 3850146040Stjr 3851146040Stjr return REG_NOERROR; 3852146040Stjr} 3853146040Stjr 3854146040Stjr/* Free the allocated memory inside NODE. */ 3855146040Stjr 3856146040Stjrstatic void 3857146040Stjrfree_token (re_token_t *node) 3858146040Stjr{ 3859146040Stjr#ifdef RE_ENABLE_I18N 3860146040Stjr if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3861146040Stjr free_charset (node->opr.mbcset); 3862146040Stjr else 3863146040Stjr#endif /* RE_ENABLE_I18N */ 3864146040Stjr if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3865146040Stjr re_free (node->opr.sbcset); 3866146040Stjr} 3867146040Stjr 3868146040Stjr/* Worker function for tree walking. Free the allocated memory inside NODE 3869146040Stjr and its children. */ 3870146040Stjr 3871146040Stjrstatic reg_errcode_t 3872146040Stjrfree_tree (void *extra, bin_tree_t *node) 3873146040Stjr{ 3874146040Stjr free_token (&node->token); 3875146040Stjr return REG_NOERROR; 3876146040Stjr} 3877146040Stjr 3878146040Stjr 3879146040Stjr/* Duplicate the node SRC, and return new node. This is a preorder 3880146040Stjr visit similar to the one implemented by the generic visitor, but 3881146040Stjr we need more infrastructure to maintain two parallel trees --- so, 3882146040Stjr it's easier to duplicate. */ 3883146040Stjr 3884146040Stjrstatic bin_tree_t * 3885146040Stjrduplicate_tree (root, dfa) 3886146040Stjr const bin_tree_t *root; 3887146040Stjr re_dfa_t *dfa; 3888146040Stjr{ 3889146040Stjr const bin_tree_t *node; 3890146040Stjr bin_tree_t *dup_root; 3891146040Stjr bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3892146040Stjr 3893146040Stjr for (node = root; ; ) 3894146040Stjr { 3895146040Stjr /* Create a new tree and link it back to the current parent. */ 3896146040Stjr *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3897146040Stjr if (*p_new == NULL) 3898146040Stjr return NULL; 3899146040Stjr (*p_new)->parent = dup_node; 3900146040Stjr (*p_new)->token.duplicated = 1; 3901146040Stjr dup_node = *p_new; 3902146040Stjr 3903146040Stjr /* Go to the left node, or up and to the right. */ 3904146040Stjr if (node->left) 3905146040Stjr { 3906146040Stjr node = node->left; 3907146040Stjr p_new = &dup_node->left; 3908146040Stjr } 3909146040Stjr else 3910146040Stjr { 3911146040Stjr const bin_tree_t *prev = NULL; 3912146040Stjr while (node->right == prev || node->right == NULL) 3913146040Stjr { 3914146040Stjr prev = node; 3915146040Stjr node = node->parent; 3916146040Stjr dup_node = dup_node->parent; 3917146040Stjr if (!node) 3918146040Stjr return dup_root; 3919146040Stjr } 3920146040Stjr node = node->right; 3921146040Stjr p_new = &dup_node->right; 3922146040Stjr } 3923146040Stjr } 3924146040Stjr} 3925