regcomp.c revision 250723
1/* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, write to the Free 18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19 02111-1307 USA. */ 20 21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 22 int length, reg_syntax_t syntax); 23static void re_compile_fastmap_iter (regex_t *bufp, 24 const re_dfastate_t *init_state, 25 char *fastmap); 26static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len); 27static void init_word_char (re_dfa_t *dfa); 28#ifdef RE_ENABLE_I18N 29static void free_charset (re_charset_t *cset); 30#endif /* RE_ENABLE_I18N */ 31static void free_workarea_compile (regex_t *preg); 32static reg_errcode_t create_initial_state (re_dfa_t *dfa); 33#ifdef RE_ENABLE_I18N 34static void optimize_utf8 (re_dfa_t *dfa); 35#endif 36static reg_errcode_t analyze (regex_t *preg); 37static reg_errcode_t create_initial_state (re_dfa_t *dfa); 38static reg_errcode_t preorder (bin_tree_t *root, 39 reg_errcode_t (fn (void *, bin_tree_t *)), 40 void *extra); 41static reg_errcode_t postorder (bin_tree_t *root, 42 reg_errcode_t (fn (void *, bin_tree_t *)), 43 void *extra); 44static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 45static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 46static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 47 bin_tree_t *node); 48static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 49static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 50static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 51static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node, 52 int top_clone_node, int root_node, 53 unsigned int constraint); 54static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx, 55 unsigned int constraint); 56static int search_duplicated_node (re_dfa_t *dfa, int org_node, 57 unsigned int constraint); 58static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 59static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 60 int node, int root); 61static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 62static int fetch_number (re_string_t *input, re_token_t *token, 63 reg_syntax_t syntax); 64static void fetch_token (re_token_t *result, re_string_t *input, 65 reg_syntax_t syntax); 66static int peek_token (re_token_t *token, re_string_t *input, 67 reg_syntax_t syntax); 68static int peek_token_bracket (re_token_t *token, re_string_t *input, 69 reg_syntax_t syntax); 70static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 71 reg_syntax_t syntax, reg_errcode_t *err); 72static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 73 re_token_t *token, reg_syntax_t syntax, 74 int nest, reg_errcode_t *err); 75static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 76 re_token_t *token, reg_syntax_t syntax, 77 int nest, reg_errcode_t *err); 78static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 79 re_token_t *token, reg_syntax_t syntax, 80 int nest, reg_errcode_t *err); 81static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 82 re_token_t *token, reg_syntax_t syntax, 83 int nest, reg_errcode_t *err); 84static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 85 re_dfa_t *dfa, re_token_t *token, 86 reg_syntax_t syntax, reg_errcode_t *err); 87static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 88 re_token_t *token, reg_syntax_t syntax, 89 reg_errcode_t *err); 90static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 91 re_string_t *regexp, 92 re_token_t *token, int token_len, 93 re_dfa_t *dfa, 94 reg_syntax_t syntax, 95 int accept_hyphen); 96static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 97 re_string_t *regexp, 98 re_token_t *token); 99#ifndef _LIBC 100# ifdef RE_ENABLE_I18N 101static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, 102 re_charset_t *mbcset, int *range_alloc, 103 bracket_elem_t *start_elem, 104 bracket_elem_t *end_elem); 105static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, 106 re_charset_t *mbcset, 107 int *coll_sym_alloc, 108 const unsigned char *name); 109# else /* not RE_ENABLE_I18N */ 110static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, 111 bracket_elem_t *start_elem, 112 bracket_elem_t *end_elem); 113static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, 114 const unsigned char *name); 115# endif /* not RE_ENABLE_I18N */ 116#endif /* not _LIBC */ 117#ifdef RE_ENABLE_I18N 118static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, 119 re_charset_t *mbcset, 120 int *equiv_class_alloc, 121 const unsigned char *name); 122static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans, 123 re_bitset_ptr_t sbcset, 124 re_charset_t *mbcset, 125 int *char_class_alloc, 126 const unsigned char *class_name, 127 reg_syntax_t syntax); 128#else /* not RE_ENABLE_I18N */ 129static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, 130 const unsigned char *name); 131static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans, 132 re_bitset_ptr_t sbcset, 133 const unsigned char *class_name, 134 reg_syntax_t syntax); 135#endif /* not RE_ENABLE_I18N */ 136static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 137 unsigned RE_TRANSLATE_TYPE trans, 138 const unsigned char *class_name, 139 const unsigned char *extra, 140 int non_match, reg_errcode_t *err); 141static bin_tree_t *create_tree (re_dfa_t *dfa, 142 bin_tree_t *left, bin_tree_t *right, 143 re_token_type_t type); 144static bin_tree_t *create_token_tree (re_dfa_t *dfa, 145 bin_tree_t *left, bin_tree_t *right, 146 const re_token_t *token); 147static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 148static void free_token (re_token_t *node); 149static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 150static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 151 152/* This table gives an error message for each of the error codes listed 153 in regex.h. Obviously the order here has to be same as there. 154 POSIX doesn't require that we do anything for REG_NOERROR, 155 but why not be nice? */ 156 157const char __re_error_msgid[] attribute_hidden = 158 { 159#define REG_NOERROR_IDX 0 160 gettext_noop ("Success") /* REG_NOERROR */ 161 "\0" 162#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 163 gettext_noop ("No match") /* REG_NOMATCH */ 164 "\0" 165#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 166 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 167 "\0" 168#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 169 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 170 "\0" 171#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 172 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 173 "\0" 174#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 175 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 176 "\0" 177#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 178 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 179 "\0" 180#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 181 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 182 "\0" 183#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 184 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 185 "\0" 186#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 187 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 188 "\0" 189#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 190 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 191 "\0" 192#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 193 gettext_noop ("Invalid range end") /* REG_ERANGE */ 194 "\0" 195#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 196 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 197 "\0" 198#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 199 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 200 "\0" 201#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 202 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 203 "\0" 204#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 205 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 206 "\0" 207#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 208 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 209 }; 210 211const size_t __re_error_msgid_idx[] attribute_hidden = 212 { 213 REG_NOERROR_IDX, 214 REG_NOMATCH_IDX, 215 REG_BADPAT_IDX, 216 REG_ECOLLATE_IDX, 217 REG_ECTYPE_IDX, 218 REG_EESCAPE_IDX, 219 REG_ESUBREG_IDX, 220 REG_EBRACK_IDX, 221 REG_EPAREN_IDX, 222 REG_EBRACE_IDX, 223 REG_BADBR_IDX, 224 REG_ERANGE_IDX, 225 REG_ESPACE_IDX, 226 REG_BADRPT_IDX, 227 REG_EEND_IDX, 228 REG_ESIZE_IDX, 229 REG_ERPAREN_IDX 230 }; 231 232/* Entry points for GNU code. */ 233 234/* re_compile_pattern is the GNU regular expression compiler: it 235 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 236 Returns 0 if the pattern was valid, otherwise an error string. 237 238 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 239 are set in BUFP on entry. */ 240 241const char * 242re_compile_pattern (pattern, length, bufp) 243 const char *pattern; 244 size_t length; 245 struct re_pattern_buffer *bufp; 246{ 247 reg_errcode_t ret; 248 249 /* And GNU code determines whether or not to get register information 250 by passing null for the REGS argument to re_match, etc., not by 251 setting no_sub, unless RE_NO_SUB is set. */ 252 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 253 254 /* Match anchors at newline. */ 255 bufp->newline_anchor = 1; 256 257 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 258 259 if (!ret) 260 return NULL; 261 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 262} 263#ifdef _LIBC 264weak_alias (__re_compile_pattern, re_compile_pattern) 265#endif 266 267/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 268 also be assigned to arbitrarily: each pattern buffer stores its own 269 syntax, so it can be changed between regex compilations. */ 270/* This has no initializer because initialized variables in Emacs 271 become read-only after dumping. */ 272reg_syntax_t re_syntax_options; 273 274 275/* Specify the precise syntax of regexps for compilation. This provides 276 for compatibility for various utilities which historically have 277 different, incompatible syntaxes. 278 279 The argument SYNTAX is a bit mask comprised of the various bits 280 defined in regex.h. We return the old syntax. */ 281 282reg_syntax_t 283re_set_syntax (syntax) 284 reg_syntax_t syntax; 285{ 286 reg_syntax_t ret = re_syntax_options; 287 288 re_syntax_options = syntax; 289 return ret; 290} 291#ifdef _LIBC 292weak_alias (__re_set_syntax, re_set_syntax) 293#endif 294 295int 296re_compile_fastmap (bufp) 297 struct re_pattern_buffer *bufp; 298{ 299 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 300 char *fastmap = bufp->fastmap; 301 302 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 303 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 304 if (dfa->init_state != dfa->init_state_word) 305 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 306 if (dfa->init_state != dfa->init_state_nl) 307 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 308 if (dfa->init_state != dfa->init_state_begbuf) 309 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 310 bufp->fastmap_accurate = 1; 311 return 0; 312} 313#ifdef _LIBC 314weak_alias (__re_compile_fastmap, re_compile_fastmap) 315#endif 316 317static inline void 318__attribute ((always_inline)) 319re_set_fastmap (char *fastmap, int icase, int ch) 320{ 321 fastmap[ch] = 1; 322 if (icase) 323 fastmap[tolower (ch)] = 1; 324} 325 326/* Helper function for re_compile_fastmap. 327 Compile fastmap for the initial_state INIT_STATE. */ 328 329static void 330re_compile_fastmap_iter (bufp, init_state, fastmap) 331 regex_t *bufp; 332 const re_dfastate_t *init_state; 333 char *fastmap; 334{ 335 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 336 int node_cnt; 337 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 338 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 339 { 340 int node = init_state->nodes.elems[node_cnt]; 341 re_token_type_t type = dfa->nodes[node].type; 342 343 if (type == CHARACTER) 344 { 345 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 346#ifdef RE_ENABLE_I18N 347 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 348 { 349 unsigned char *buf = alloca (dfa->mb_cur_max), *p; 350 wchar_t wc; 351 mbstate_t state; 352 353 p = buf; 354 *p++ = dfa->nodes[node].opr.c; 355 while (++node < dfa->nodes_len 356 && dfa->nodes[node].type == CHARACTER 357 && dfa->nodes[node].mb_partial) 358 *p++ = dfa->nodes[node].opr.c; 359 memset (&state, 0, sizeof (state)); 360 if (mbrtowc (&wc, (const char *) buf, p - buf, 361 &state) == p - buf 362 && (__wcrtomb ((char *) buf, towlower (wc), &state) 363 != (size_t) -1)) 364 re_set_fastmap (fastmap, 0, buf[0]); 365 } 366#endif 367 } 368 else if (type == SIMPLE_BRACKET) 369 { 370 int i, j, ch; 371 for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 372 for (j = 0; j < UINT_BITS; ++j, ++ch) 373 if (dfa->nodes[node].opr.sbcset[i] & (1 << j)) 374 re_set_fastmap (fastmap, icase, ch); 375 } 376#ifdef RE_ENABLE_I18N 377 else if (type == COMPLEX_BRACKET) 378 { 379 int i; 380 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 381 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes 382 || cset->nranges || cset->nchar_classes) 383 { 384# ifdef _LIBC 385 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) 386 { 387 /* In this case we want to catch the bytes which are 388 the first byte of any collation elements. 389 e.g. In da_DK, we want to catch 'a' since "aa" 390 is a valid collation element, and don't catch 391 'b' since 'b' is the only collation element 392 which starts from 'b'. */ 393 int j, ch; 394 const int32_t *table = (const int32_t *) 395 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 396 for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 397 for (j = 0; j < UINT_BITS; ++j, ++ch) 398 if (table[ch] < 0) 399 re_set_fastmap (fastmap, icase, ch); 400 } 401# else 402 if (dfa->mb_cur_max > 1) 403 for (i = 0; i < SBC_MAX; ++i) 404 if (__btowc (i) == WEOF) 405 re_set_fastmap (fastmap, icase, i); 406# endif /* not _LIBC */ 407 } 408 for (i = 0; i < cset->nmbchars; ++i) 409 { 410 char buf[256]; 411 mbstate_t state; 412 memset (&state, '\0', sizeof (state)); 413 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 414 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 415 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 416 { 417 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 418 != (size_t) -1) 419 re_set_fastmap (fastmap, 0, *(unsigned char *) buf); 420 } 421 } 422 } 423#endif /* RE_ENABLE_I18N */ 424 else if (type == OP_PERIOD 425#ifdef RE_ENABLE_I18N 426 || type == OP_UTF8_PERIOD 427#endif /* RE_ENABLE_I18N */ 428 || type == END_OF_RE) 429 { 430 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 431 if (type == END_OF_RE) 432 bufp->can_be_null = 1; 433 return; 434 } 435 } 436} 437 438/* Entry point for POSIX code. */ 439/* regcomp takes a regular expression as a string and compiles it. 440 441 PREG is a regex_t *. We do not expect any fields to be initialized, 442 since POSIX says we shouldn't. Thus, we set 443 444 `buffer' to the compiled pattern; 445 `used' to the length of the compiled pattern; 446 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 447 REG_EXTENDED bit in CFLAGS is set; otherwise, to 448 RE_SYNTAX_POSIX_BASIC; 449 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 450 `fastmap' to an allocated space for the fastmap; 451 `fastmap_accurate' to zero; 452 `re_nsub' to the number of subexpressions in PATTERN. 453 454 PATTERN is the address of the pattern string. 455 456 CFLAGS is a series of bits which affect compilation. 457 458 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 459 use POSIX basic syntax. 460 461 If REG_NEWLINE is set, then . and [^...] don't match newline. 462 Also, regexec will try a match beginning after every newline. 463 464 If REG_ICASE is set, then we considers upper- and lowercase 465 versions of letters to be equivalent when matching. 466 467 If REG_NOSUB is set, then when PREG is passed to regexec, that 468 routine will report only success or failure, and nothing about the 469 registers. 470 471 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 472 the return codes and their meanings.) */ 473 474int 475regcomp (preg, pattern, cflags) 476 regex_t *__restrict preg; 477 const char *__restrict pattern; 478 int cflags; 479{ 480 reg_errcode_t ret; 481 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 482 : RE_SYNTAX_POSIX_BASIC); 483 484 preg->buffer = NULL; 485 preg->allocated = 0; 486 preg->used = 0; 487 488 /* Try to allocate space for the fastmap. */ 489 preg->fastmap = re_malloc (char, SBC_MAX); 490 if (BE (preg->fastmap == NULL, 0)) 491 return REG_ESPACE; 492 493 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 494 495 /* If REG_NEWLINE is set, newlines are treated differently. */ 496 if (cflags & REG_NEWLINE) 497 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 498 syntax &= ~RE_DOT_NEWLINE; 499 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 500 /* It also changes the matching behavior. */ 501 preg->newline_anchor = 1; 502 } 503 else 504 preg->newline_anchor = 0; 505 preg->no_sub = !!(cflags & REG_NOSUB); 506 preg->translate = NULL; 507 508 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 509 510 /* POSIX doesn't distinguish between an unmatched open-group and an 511 unmatched close-group: both are REG_EPAREN. */ 512 if (ret == REG_ERPAREN) 513 ret = REG_EPAREN; 514 515 /* We have already checked preg->fastmap != NULL. */ 516 if (BE (ret == REG_NOERROR, 1)) 517 /* Compute the fastmap now, since regexec cannot modify the pattern 518 buffer. This function never fails in this implementation. */ 519 (void) re_compile_fastmap (preg); 520 else 521 { 522 /* Some error occurred while compiling the expression. */ 523 re_free (preg->fastmap); 524 preg->fastmap = NULL; 525 } 526 527 return (int) ret; 528} 529#ifdef _LIBC 530weak_alias (__regcomp, regcomp) 531#endif 532 533/* Returns a message corresponding to an error code, ERRCODE, returned 534 from either regcomp or regexec. We don't use PREG here. */ 535 536size_t 537regerror (errcode, preg, errbuf, errbuf_size) 538 int errcode; 539 const regex_t *preg; 540 char *errbuf; 541 size_t errbuf_size; 542{ 543 const char *msg; 544 size_t msg_size; 545 546 if (BE (errcode < 0 547 || errcode >= (int) (sizeof (__re_error_msgid_idx) 548 / sizeof (__re_error_msgid_idx[0])), 0)) 549 /* Only error codes returned by the rest of the code should be passed 550 to this routine. If we are given anything else, or if other regex 551 code generates an invalid error code, then the program has a bug. 552 Dump core so we can fix it. */ 553 abort (); 554 555 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 556 557 msg_size = strlen (msg) + 1; /* Includes the null. */ 558 559 if (BE (errbuf_size != 0, 1)) 560 { 561 if (BE (msg_size > errbuf_size, 0)) 562 { 563#if defined HAVE_MEMPCPY || defined _LIBC 564 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 565#else 566 memcpy (errbuf, msg, errbuf_size - 1); 567 errbuf[errbuf_size - 1] = 0; 568#endif 569 } 570 else 571 memcpy (errbuf, msg, msg_size); 572 } 573 574 return msg_size; 575} 576#ifdef _LIBC 577weak_alias (__regerror, regerror) 578#endif 579 580 581#ifdef RE_ENABLE_I18N 582/* This static array is used for the map to single-byte characters when 583 UTF-8 is used. Otherwise we would allocate memory just to initialize 584 it the same all the time. UTF-8 is the preferred encoding so this is 585 a worthwhile optimization. */ 586static const bitset utf8_sb_map = 587{ 588 /* Set the first 128 bits. */ 589# if UINT_MAX == 0xffffffff 590 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff 591# else 592# error "Add case for new unsigned int size" 593# endif 594}; 595#endif 596 597 598static void 599free_dfa_content (re_dfa_t *dfa) 600{ 601 int i, j; 602 603 if (dfa->nodes) 604 for (i = 0; i < dfa->nodes_len; ++i) 605 free_token (dfa->nodes + i); 606 re_free (dfa->nexts); 607 for (i = 0; i < dfa->nodes_len; ++i) 608 { 609 if (dfa->eclosures != NULL) 610 re_node_set_free (dfa->eclosures + i); 611 if (dfa->inveclosures != NULL) 612 re_node_set_free (dfa->inveclosures + i); 613 if (dfa->edests != NULL) 614 re_node_set_free (dfa->edests + i); 615 } 616 re_free (dfa->edests); 617 re_free (dfa->eclosures); 618 re_free (dfa->inveclosures); 619 re_free (dfa->nodes); 620 621 if (dfa->state_table) 622 for (i = 0; i <= dfa->state_hash_mask; ++i) 623 { 624 struct re_state_table_entry *entry = dfa->state_table + i; 625 for (j = 0; j < entry->num; ++j) 626 { 627 re_dfastate_t *state = entry->array[j]; 628 free_state (state); 629 } 630 re_free (entry->array); 631 } 632 re_free (dfa->state_table); 633#ifdef RE_ENABLE_I18N 634 if (dfa->sb_char != utf8_sb_map) 635 re_free (dfa->sb_char); 636#endif 637 re_free (dfa->subexp_map); 638#ifdef DEBUG 639 re_free (dfa->re_str); 640#endif 641 642 re_free (dfa); 643} 644 645 646/* Free dynamically allocated space used by PREG. */ 647 648void 649regfree (preg) 650 regex_t *preg; 651{ 652 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 653 if (BE (dfa != NULL, 1)) 654 free_dfa_content (dfa); 655 preg->buffer = NULL; 656 preg->allocated = 0; 657 658 re_free (preg->fastmap); 659 preg->fastmap = NULL; 660 661 re_free (preg->translate); 662 preg->translate = NULL; 663} 664#ifdef _LIBC 665weak_alias (__regfree, regfree) 666#endif 667 668/* Entry points compatible with 4.2 BSD regex library. We don't define 669 them unless specifically requested. */ 670 671#if defined _REGEX_RE_COMP || defined _LIBC 672 673/* BSD has one and only one pattern buffer. */ 674static struct re_pattern_buffer re_comp_buf; 675 676char * 677# ifdef _LIBC 678/* Make these definitions weak in libc, so POSIX programs can redefine 679 these names if they don't use our functions, and still use 680 regcomp/regexec above without link errors. */ 681weak_function 682# endif 683re_comp (s) 684 const char *s; 685{ 686 reg_errcode_t ret; 687 char *fastmap; 688 689 if (!s) 690 { 691 if (!re_comp_buf.buffer) 692 return gettext ("No previous regular expression"); 693 return 0; 694 } 695 696 if (re_comp_buf.buffer) 697 { 698 fastmap = re_comp_buf.fastmap; 699 re_comp_buf.fastmap = NULL; 700 __regfree (&re_comp_buf); 701 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 702 re_comp_buf.fastmap = fastmap; 703 } 704 705 if (re_comp_buf.fastmap == NULL) 706 { 707 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 708 if (re_comp_buf.fastmap == NULL) 709 return (char *) gettext (__re_error_msgid 710 + __re_error_msgid_idx[(int) REG_ESPACE]); 711 } 712 713 /* Since `re_exec' always passes NULL for the `regs' argument, we 714 don't need to initialize the pattern buffer fields which affect it. */ 715 716 /* Match anchors at newlines. */ 717 re_comp_buf.newline_anchor = 1; 718 719 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 720 721 if (!ret) 722 return NULL; 723 724 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 725 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 726} 727 728#ifdef _LIBC 729libc_freeres_fn (free_mem) 730{ 731 __regfree (&re_comp_buf); 732} 733#endif 734 735#endif /* _REGEX_RE_COMP */ 736 737/* Internal entry point. 738 Compile the regular expression PATTERN, whose length is LENGTH. 739 SYNTAX indicate regular expression's syntax. */ 740 741static reg_errcode_t 742re_compile_internal (preg, pattern, length, syntax) 743 regex_t *preg; 744 const char * pattern; 745 int length; 746 reg_syntax_t syntax; 747{ 748 reg_errcode_t err = REG_NOERROR; 749 re_dfa_t *dfa; 750 re_string_t regexp; 751 752 /* Initialize the pattern buffer. */ 753 preg->fastmap_accurate = 0; 754 preg->syntax = syntax; 755 preg->not_bol = preg->not_eol = 0; 756 preg->used = 0; 757 preg->re_nsub = 0; 758 preg->can_be_null = 0; 759 preg->regs_allocated = REGS_UNALLOCATED; 760 761 /* Initialize the dfa. */ 762 dfa = (re_dfa_t *) preg->buffer; 763 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 764 { 765 /* If zero allocated, but buffer is non-null, try to realloc 766 enough space. This loses if buffer's address is bogus, but 767 that is the user's responsibility. If ->buffer is NULL this 768 is a simple allocation. */ 769 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 770 if (dfa == NULL) 771 return REG_ESPACE; 772 preg->allocated = sizeof (re_dfa_t); 773 preg->buffer = (unsigned char *) dfa; 774 } 775 preg->used = sizeof (re_dfa_t); 776 777 err = init_dfa (dfa, length); 778 if (BE (err != REG_NOERROR, 0)) 779 { 780 free_dfa_content (dfa); 781 preg->buffer = NULL; 782 preg->allocated = 0; 783 return err; 784 } 785#ifdef DEBUG 786 dfa->re_str = re_malloc (char, length + 1); 787 strncpy (dfa->re_str, pattern, length + 1); 788#endif 789 790 err = re_string_construct (®exp, pattern, length, preg->translate, 791 syntax & RE_ICASE, dfa); 792 if (BE (err != REG_NOERROR, 0)) 793 { 794 re_compile_internal_free_return: 795 free_workarea_compile (preg); 796 re_string_destruct (®exp); 797 free_dfa_content (dfa); 798 preg->buffer = NULL; 799 preg->allocated = 0; 800 return err; 801 } 802 803 /* Parse the regular expression, and build a structure tree. */ 804 preg->re_nsub = 0; 805 dfa->str_tree = parse (®exp, preg, syntax, &err); 806 if (BE (dfa->str_tree == NULL, 0)) 807 goto re_compile_internal_free_return; 808 809 /* Analyze the tree and create the nfa. */ 810 err = analyze (preg); 811 if (BE (err != REG_NOERROR, 0)) 812 goto re_compile_internal_free_return; 813 814#ifdef RE_ENABLE_I18N 815 /* If possible, do searching in single byte encoding to speed things up. */ 816 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 817 optimize_utf8 (dfa); 818#endif 819 820 /* Then create the initial state of the dfa. */ 821 err = create_initial_state (dfa); 822 823 /* Release work areas. */ 824 free_workarea_compile (preg); 825 re_string_destruct (®exp); 826 827 if (BE (err != REG_NOERROR, 0)) 828 { 829 free_dfa_content (dfa); 830 preg->buffer = NULL; 831 preg->allocated = 0; 832 } 833 834 return err; 835} 836 837/* Initialize DFA. We use the length of the regular expression PAT_LEN 838 as the initial length of some arrays. */ 839 840static reg_errcode_t 841init_dfa (dfa, pat_len) 842 re_dfa_t *dfa; 843 int pat_len; 844{ 845 int table_size; 846#ifndef _LIBC 847 char *codeset_name; 848#endif 849 850 memset (dfa, '\0', sizeof (re_dfa_t)); 851 852 /* Force allocation of str_tree_storage the first time. */ 853 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 854 855 dfa->nodes_alloc = pat_len + 1; 856 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 857 858 dfa->states_alloc = pat_len + 1; 859 860 /* table_size = 2 ^ ceil(log pat_len) */ 861 for (table_size = 1; table_size > 0; table_size <<= 1) 862 if (table_size > pat_len) 863 break; 864 865 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 866 dfa->state_hash_mask = table_size - 1; 867 868 dfa->mb_cur_max = MB_CUR_MAX; 869#ifdef _LIBC 870 if (dfa->mb_cur_max == 6 871 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 872 dfa->is_utf8 = 1; 873 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 874 != 0); 875#else 876# ifdef HAVE_LANGINFO_CODESET 877 codeset_name = nl_langinfo (CODESET); 878# else 879 codeset_name = getenv ("LC_ALL"); 880 if (codeset_name == NULL || codeset_name[0] == '\0') 881 codeset_name = getenv ("LC_CTYPE"); 882 if (codeset_name == NULL || codeset_name[0] == '\0') 883 codeset_name = getenv ("LANG"); 884 if (codeset_name == NULL) 885 codeset_name = ""; 886 else if (strchr (codeset_name, '.') != NULL) 887 codeset_name = strchr (codeset_name, '.') + 1; 888# endif 889 890 if (strcasecmp (codeset_name, "UTF-8") == 0 891 || strcasecmp (codeset_name, "UTF8") == 0) 892 dfa->is_utf8 = 1; 893 894 /* We check exhaustively in the loop below if this charset is a 895 superset of ASCII. */ 896 dfa->map_notascii = 0; 897#endif 898 899#ifdef RE_ENABLE_I18N 900 if (dfa->mb_cur_max > 1) 901 { 902 if (dfa->is_utf8) 903 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 904 else 905 { 906 int i, j, ch; 907 908 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1); 909 if (BE (dfa->sb_char == NULL, 0)) 910 return REG_ESPACE; 911 912 /* Clear all bits by, then set those corresponding to single 913 byte chars. */ 914 bitset_empty (dfa->sb_char); 915 916 for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 917 for (j = 0; j < UINT_BITS; ++j, ++ch) 918 { 919 wchar_t wch = __btowc (ch); 920 if (wch != WEOF) 921 dfa->sb_char[i] |= 1 << j; 922# ifndef _LIBC 923 if (isascii (ch) && wch != (wchar_t) ch) 924 dfa->map_notascii = 1; 925# endif 926 } 927 } 928 } 929#endif 930 931 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 932 return REG_ESPACE; 933 return REG_NOERROR; 934} 935 936/* Initialize WORD_CHAR table, which indicate which character is 937 "word". In this case "word" means that it is the word construction 938 character used by some operators like "\<", "\>", etc. */ 939 940static void 941init_word_char (dfa) 942 re_dfa_t *dfa; 943{ 944 int i, j, ch; 945 dfa->word_ops_used = 1; 946 for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 947 for (j = 0; j < UINT_BITS; ++j, ++ch) 948 if (isalnum (ch) || ch == '_') 949 dfa->word_char[i] |= 1 << j; 950} 951 952/* Free the work area which are only used while compiling. */ 953 954static void 955free_workarea_compile (preg) 956 regex_t *preg; 957{ 958 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 959 bin_tree_storage_t *storage, *next; 960 for (storage = dfa->str_tree_storage; storage; storage = next) 961 { 962 next = storage->next; 963 re_free (storage); 964 } 965 dfa->str_tree_storage = NULL; 966 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 967 dfa->str_tree = NULL; 968 re_free (dfa->org_indices); 969 dfa->org_indices = NULL; 970} 971 972/* Create initial states for all contexts. */ 973 974static reg_errcode_t 975create_initial_state (dfa) 976 re_dfa_t *dfa; 977{ 978 int first, i; 979 reg_errcode_t err; 980 re_node_set init_nodes; 981 982 /* Initial states have the epsilon closure of the node which is 983 the first node of the regular expression. */ 984 first = dfa->str_tree->first->node_idx; 985 dfa->init_node = first; 986 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 987 if (BE (err != REG_NOERROR, 0)) 988 return err; 989 990 /* The back-references which are in initial states can epsilon transit, 991 since in this case all of the subexpressions can be null. 992 Then we add epsilon closures of the nodes which are the next nodes of 993 the back-references. */ 994 if (dfa->nbackref > 0) 995 for (i = 0; i < init_nodes.nelem; ++i) 996 { 997 int node_idx = init_nodes.elems[i]; 998 re_token_type_t type = dfa->nodes[node_idx].type; 999 1000 int clexp_idx; 1001 if (type != OP_BACK_REF) 1002 continue; 1003 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1004 { 1005 re_token_t *clexp_node; 1006 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1007 if (clexp_node->type == OP_CLOSE_SUBEXP 1008 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1009 break; 1010 } 1011 if (clexp_idx == init_nodes.nelem) 1012 continue; 1013 1014 if (type == OP_BACK_REF) 1015 { 1016 int dest_idx = dfa->edests[node_idx].elems[0]; 1017 if (!re_node_set_contains (&init_nodes, dest_idx)) 1018 { 1019 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1020 i = 0; 1021 } 1022 } 1023 } 1024 1025 /* It must be the first time to invoke acquire_state. */ 1026 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1027 /* We don't check ERR here, since the initial state must not be NULL. */ 1028 if (BE (dfa->init_state == NULL, 0)) 1029 return err; 1030 if (dfa->init_state->has_constraint) 1031 { 1032 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1033 CONTEXT_WORD); 1034 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1035 CONTEXT_NEWLINE); 1036 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1037 &init_nodes, 1038 CONTEXT_NEWLINE 1039 | CONTEXT_BEGBUF); 1040 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1041 || dfa->init_state_begbuf == NULL, 0)) 1042 return err; 1043 } 1044 else 1045 dfa->init_state_word = dfa->init_state_nl 1046 = dfa->init_state_begbuf = dfa->init_state; 1047 1048 re_node_set_free (&init_nodes); 1049 return REG_NOERROR; 1050} 1051 1052#ifdef RE_ENABLE_I18N 1053/* If it is possible to do searching in single byte encoding instead of UTF-8 1054 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1055 DFA nodes where needed. */ 1056 1057static void 1058optimize_utf8 (dfa) 1059 re_dfa_t *dfa; 1060{ 1061 int node, i, mb_chars = 0, has_period = 0; 1062 1063 for (node = 0; node < dfa->nodes_len; ++node) 1064 switch (dfa->nodes[node].type) 1065 { 1066 case CHARACTER: 1067 if (dfa->nodes[node].opr.c >= 0x80) 1068 mb_chars = 1; 1069 break; 1070 case ANCHOR: 1071 switch (dfa->nodes[node].opr.idx) 1072 { 1073 case LINE_FIRST: 1074 case LINE_LAST: 1075 case BUF_FIRST: 1076 case BUF_LAST: 1077 break; 1078 default: 1079 /* Word anchors etc. cannot be handled. */ 1080 return; 1081 } 1082 break; 1083 case OP_PERIOD: 1084 has_period = 1; 1085 break; 1086 case OP_BACK_REF: 1087 case OP_ALT: 1088 case END_OF_RE: 1089 case OP_DUP_ASTERISK: 1090 case OP_OPEN_SUBEXP: 1091 case OP_CLOSE_SUBEXP: 1092 break; 1093 case COMPLEX_BRACKET: 1094 return; 1095 case SIMPLE_BRACKET: 1096 /* Just double check. */ 1097 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i) 1098 if (dfa->nodes[node].opr.sbcset[i]) 1099 return; 1100 break; 1101 default: 1102 abort (); 1103 } 1104 1105 if (mb_chars || has_period) 1106 for (node = 0; node < dfa->nodes_len; ++node) 1107 { 1108 if (dfa->nodes[node].type == CHARACTER 1109 && dfa->nodes[node].opr.c >= 0x80) 1110 dfa->nodes[node].mb_partial = 0; 1111 else if (dfa->nodes[node].type == OP_PERIOD) 1112 dfa->nodes[node].type = OP_UTF8_PERIOD; 1113 } 1114 1115 /* The search can be in single byte locale. */ 1116 dfa->mb_cur_max = 1; 1117 dfa->is_utf8 = 0; 1118 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1119} 1120#endif 1121 1122/* Analyze the structure tree, and calculate "first", "next", "edest", 1123 "eclosure", and "inveclosure". */ 1124 1125static reg_errcode_t 1126analyze (preg) 1127 regex_t *preg; 1128{ 1129 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1130 reg_errcode_t ret; 1131 1132 /* Allocate arrays. */ 1133 dfa->nexts = re_malloc (int, dfa->nodes_alloc); 1134 dfa->org_indices = re_malloc (int, dfa->nodes_alloc); 1135 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1136 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1137 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1138 || dfa->eclosures == NULL, 0)) 1139 return REG_ESPACE; 1140 1141 dfa->subexp_map = re_malloc (int, preg->re_nsub); 1142 if (dfa->subexp_map != NULL) 1143 { 1144 int i; 1145 for (i = 0; i < preg->re_nsub; i++) 1146 dfa->subexp_map[i] = i; 1147 preorder (dfa->str_tree, optimize_subexps, dfa); 1148 for (i = 0; i < preg->re_nsub; i++) 1149 if (dfa->subexp_map[i] != i) 1150 break; 1151 if (i == preg->re_nsub) 1152 { 1153 free (dfa->subexp_map); 1154 dfa->subexp_map = NULL; 1155 } 1156 } 1157 1158 ret = postorder (dfa->str_tree, lower_subexps, preg); 1159 if (BE (ret != REG_NOERROR, 0)) 1160 return ret; 1161 ret = postorder (dfa->str_tree, calc_first, dfa); 1162 if (BE (ret != REG_NOERROR, 0)) 1163 return ret; 1164 preorder (dfa->str_tree, calc_next, dfa); 1165 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1166 if (BE (ret != REG_NOERROR, 0)) 1167 return ret; 1168 ret = calc_eclosure (dfa); 1169 if (BE (ret != REG_NOERROR, 0)) 1170 return ret; 1171 1172 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1173 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1174 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1175 || dfa->nbackref) 1176 { 1177 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1178 if (BE (dfa->inveclosures == NULL, 0)) 1179 return REG_ESPACE; 1180 ret = calc_inveclosure (dfa); 1181 } 1182 1183 return ret; 1184} 1185 1186/* Our parse trees are very unbalanced, so we cannot use a stack to 1187 implement parse tree visits. Instead, we use parent pointers and 1188 some hairy code in these two functions. */ 1189static reg_errcode_t 1190postorder (root, fn, extra) 1191 bin_tree_t *root; 1192 reg_errcode_t (fn (void *, bin_tree_t *)); 1193 void *extra; 1194{ 1195 bin_tree_t *node, *prev; 1196 1197 for (node = root; ; ) 1198 { 1199 /* Descend down the tree, preferably to the left (or to the right 1200 if that's the only child). */ 1201 while (node->left || node->right) 1202 if (node->left) 1203 node = node->left; 1204 else 1205 node = node->right; 1206 1207 do 1208 { 1209 reg_errcode_t err = fn (extra, node); 1210 if (BE (err != REG_NOERROR, 0)) 1211 return err; 1212 if (node->parent == NULL) 1213 return REG_NOERROR; 1214 prev = node; 1215 node = node->parent; 1216 } 1217 /* Go up while we have a node that is reached from the right. */ 1218 while (node->right == prev || node->right == NULL); 1219 node = node->right; 1220 } 1221} 1222 1223static reg_errcode_t 1224preorder (root, fn, extra) 1225 bin_tree_t *root; 1226 reg_errcode_t (fn (void *, bin_tree_t *)); 1227 void *extra; 1228{ 1229 bin_tree_t *node; 1230 1231 for (node = root; ; ) 1232 { 1233 reg_errcode_t err = fn (extra, node); 1234 if (BE (err != REG_NOERROR, 0)) 1235 return err; 1236 1237 /* Go to the left node, or up and to the right. */ 1238 if (node->left) 1239 node = node->left; 1240 else 1241 { 1242 bin_tree_t *prev = NULL; 1243 while (node->right == prev || node->right == NULL) 1244 { 1245 prev = node; 1246 node = node->parent; 1247 if (!node) 1248 return REG_NOERROR; 1249 } 1250 node = node->right; 1251 } 1252 } 1253} 1254 1255/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1256 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1257 backreferences as well. Requires a preorder visit. */ 1258static reg_errcode_t 1259optimize_subexps (extra, node) 1260 void *extra; 1261 bin_tree_t *node; 1262{ 1263 re_dfa_t *dfa = (re_dfa_t *) extra; 1264 1265 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1266 { 1267 int idx = node->token.opr.idx; 1268 node->token.opr.idx = dfa->subexp_map[idx]; 1269 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1270 } 1271 1272 else if (node->token.type == SUBEXP 1273 && node->left && node->left->token.type == SUBEXP) 1274 { 1275 int other_idx = node->left->token.opr.idx; 1276 1277 node->left = node->left->left; 1278 if (node->left) 1279 node->left->parent = node; 1280 1281 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1282 if (other_idx < 8 * sizeof (dfa->used_bkref_map)) 1283 dfa->used_bkref_map &= ~(1 << other_idx); 1284 } 1285 1286 return REG_NOERROR; 1287} 1288 1289/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1290 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1291static reg_errcode_t 1292lower_subexps (extra, node) 1293 void *extra; 1294 bin_tree_t *node; 1295{ 1296 regex_t *preg = (regex_t *) extra; 1297 reg_errcode_t err = REG_NOERROR; 1298 1299 if (node->left && node->left->token.type == SUBEXP) 1300 { 1301 node->left = lower_subexp (&err, preg, node->left); 1302 if (node->left) 1303 node->left->parent = node; 1304 } 1305 if (node->right && node->right->token.type == SUBEXP) 1306 { 1307 node->right = lower_subexp (&err, preg, node->right); 1308 if (node->right) 1309 node->right->parent = node; 1310 } 1311 1312 return err; 1313} 1314 1315static bin_tree_t * 1316lower_subexp (err, preg, node) 1317 reg_errcode_t *err; 1318 regex_t *preg; 1319 bin_tree_t *node; 1320{ 1321 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1322 bin_tree_t *body = node->left; 1323 bin_tree_t *op, *cls, *tree1, *tree; 1324 1325 if (preg->no_sub 1326 /* We do not optimize empty subexpressions, because otherwise we may 1327 have bad CONCAT nodes with NULL children. This is obviously not 1328 very common, so we do not lose much. An example that triggers 1329 this case is the sed "script" /\(\)/x. */ 1330 && node->left != NULL 1331 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map) 1332 || !(dfa->used_bkref_map & (1 << node->token.opr.idx)))) 1333 return node->left; 1334 1335 /* Convert the SUBEXP node to the concatenation of an 1336 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1337 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1338 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1339 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1340 tree = create_tree (dfa, op, tree1, CONCAT); 1341 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1342 { 1343 *err = REG_ESPACE; 1344 return NULL; 1345 } 1346 1347 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1348 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1349 return tree; 1350} 1351 1352/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1353 nodes. Requires a postorder visit. */ 1354static reg_errcode_t 1355calc_first (extra, node) 1356 void *extra; 1357 bin_tree_t *node; 1358{ 1359 re_dfa_t *dfa = (re_dfa_t *) extra; 1360 if (node->token.type == CONCAT) 1361 { 1362 node->first = node->left->first; 1363 node->node_idx = node->left->node_idx; 1364 } 1365 else 1366 { 1367 node->first = node; 1368 node->node_idx = re_dfa_add_node (dfa, node->token); 1369 if (BE (node->node_idx == -1, 0)) 1370 return REG_ESPACE; 1371 } 1372 return REG_NOERROR; 1373} 1374 1375/* Pass 2: compute NEXT on the tree. Preorder visit. */ 1376static reg_errcode_t 1377calc_next (extra, node) 1378 void *extra; 1379 bin_tree_t *node; 1380{ 1381 switch (node->token.type) 1382 { 1383 case OP_DUP_ASTERISK: 1384 node->left->next = node; 1385 break; 1386 case CONCAT: 1387 node->left->next = node->right->first; 1388 node->right->next = node->next; 1389 break; 1390 default: 1391 if (node->left) 1392 node->left->next = node->next; 1393 if (node->right) 1394 node->right->next = node->next; 1395 break; 1396 } 1397 return REG_NOERROR; 1398} 1399 1400/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1401static reg_errcode_t 1402link_nfa_nodes (extra, node) 1403 void *extra; 1404 bin_tree_t *node; 1405{ 1406 re_dfa_t *dfa = (re_dfa_t *) extra; 1407 int idx = node->node_idx; 1408 reg_errcode_t err = REG_NOERROR; 1409 1410 switch (node->token.type) 1411 { 1412 case CONCAT: 1413 break; 1414 1415 case END_OF_RE: 1416 assert (node->next == NULL); 1417 break; 1418 1419 case OP_DUP_ASTERISK: 1420 case OP_ALT: 1421 { 1422 int left, right; 1423 dfa->has_plural_match = 1; 1424 if (node->left != NULL) 1425 left = node->left->first->node_idx; 1426 else 1427 left = node->next->node_idx; 1428 if (node->right != NULL) 1429 right = node->right->first->node_idx; 1430 else 1431 right = node->next->node_idx; 1432 assert (left > -1); 1433 assert (right > -1); 1434 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1435 } 1436 break; 1437 1438 case ANCHOR: 1439 case OP_OPEN_SUBEXP: 1440 case OP_CLOSE_SUBEXP: 1441 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1442 break; 1443 1444 case OP_BACK_REF: 1445 dfa->nexts[idx] = node->next->node_idx; 1446 if (node->token.type == OP_BACK_REF) 1447 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1448 break; 1449 1450 default: 1451 assert (!IS_EPSILON_NODE (node->token.type)); 1452 dfa->nexts[idx] = node->next->node_idx; 1453 break; 1454 } 1455 1456 return err; 1457} 1458 1459/* Duplicate the epsilon closure of the node ROOT_NODE. 1460 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1461 to their own constraint. */ 1462 1463static reg_errcode_t 1464duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, 1465 init_constraint) 1466 re_dfa_t *dfa; 1467 int top_org_node, top_clone_node, root_node; 1468 unsigned int init_constraint; 1469{ 1470 reg_errcode_t err; 1471 int org_node, clone_node, ret; 1472 unsigned int constraint = init_constraint; 1473 for (org_node = top_org_node, clone_node = top_clone_node;;) 1474 { 1475 int org_dest, clone_dest; 1476 if (dfa->nodes[org_node].type == OP_BACK_REF) 1477 { 1478 /* If the back reference epsilon-transit, its destination must 1479 also have the constraint. Then duplicate the epsilon closure 1480 of the destination of the back reference, and store it in 1481 edests of the back reference. */ 1482 org_dest = dfa->nexts[org_node]; 1483 re_node_set_empty (dfa->edests + clone_node); 1484 err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1485 if (BE (err != REG_NOERROR, 0)) 1486 return err; 1487 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1488 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1489 if (BE (ret < 0, 0)) 1490 return REG_ESPACE; 1491 } 1492 else if (dfa->edests[org_node].nelem == 0) 1493 { 1494 /* In case of the node can't epsilon-transit, don't duplicate the 1495 destination and store the original destination as the 1496 destination of the node. */ 1497 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1498 break; 1499 } 1500 else if (dfa->edests[org_node].nelem == 1) 1501 { 1502 /* In case of the node can epsilon-transit, and it has only one 1503 destination. */ 1504 org_dest = dfa->edests[org_node].elems[0]; 1505 re_node_set_empty (dfa->edests + clone_node); 1506 if (dfa->nodes[org_node].type == ANCHOR) 1507 { 1508 /* In case of the node has another constraint, append it. */ 1509 if (org_node == root_node && clone_node != org_node) 1510 { 1511 /* ...but if the node is root_node itself, it means the 1512 epsilon closure have a loop, then tie it to the 1513 destination of the root_node. */ 1514 ret = re_node_set_insert (dfa->edests + clone_node, 1515 org_dest); 1516 if (BE (ret < 0, 0)) 1517 return REG_ESPACE; 1518 break; 1519 } 1520 constraint |= dfa->nodes[org_node].opr.ctx_type; 1521 } 1522 err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1523 if (BE (err != REG_NOERROR, 0)) 1524 return err; 1525 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1526 if (BE (ret < 0, 0)) 1527 return REG_ESPACE; 1528 } 1529 else /* dfa->edests[org_node].nelem == 2 */ 1530 { 1531 /* In case of the node can epsilon-transit, and it has two 1532 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1533 org_dest = dfa->edests[org_node].elems[0]; 1534 re_node_set_empty (dfa->edests + clone_node); 1535 /* Search for a duplicated node which satisfies the constraint. */ 1536 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1537 if (clone_dest == -1) 1538 { 1539 /* There are no such a duplicated node, create a new one. */ 1540 err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1541 if (BE (err != REG_NOERROR, 0)) 1542 return err; 1543 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1544 if (BE (ret < 0, 0)) 1545 return REG_ESPACE; 1546 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1547 root_node, constraint); 1548 if (BE (err != REG_NOERROR, 0)) 1549 return err; 1550 } 1551 else 1552 { 1553 /* There are a duplicated node which satisfy the constraint, 1554 use it to avoid infinite loop. */ 1555 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1556 if (BE (ret < 0, 0)) 1557 return REG_ESPACE; 1558 } 1559 1560 org_dest = dfa->edests[org_node].elems[1]; 1561 err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1562 if (BE (err != REG_NOERROR, 0)) 1563 return err; 1564 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1565 if (BE (ret < 0, 0)) 1566 return REG_ESPACE; 1567 } 1568 org_node = org_dest; 1569 clone_node = clone_dest; 1570 } 1571 return REG_NOERROR; 1572} 1573 1574/* Search for a node which is duplicated from the node ORG_NODE, and 1575 satisfies the constraint CONSTRAINT. */ 1576 1577static int 1578search_duplicated_node (dfa, org_node, constraint) 1579 re_dfa_t *dfa; 1580 int org_node; 1581 unsigned int constraint; 1582{ 1583 int idx; 1584 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1585 { 1586 if (org_node == dfa->org_indices[idx] 1587 && constraint == dfa->nodes[idx].constraint) 1588 return idx; /* Found. */ 1589 } 1590 return -1; /* Not found. */ 1591} 1592 1593/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1594 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded, 1595 otherwise return the error code. */ 1596 1597static reg_errcode_t 1598duplicate_node (new_idx, dfa, org_idx, constraint) 1599 re_dfa_t *dfa; 1600 int *new_idx, org_idx; 1601 unsigned int constraint; 1602{ 1603 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1604 if (BE (dup_idx == -1, 0)) 1605 return REG_ESPACE; 1606 dfa->nodes[dup_idx].constraint = constraint; 1607 if (dfa->nodes[org_idx].type == ANCHOR) 1608 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; 1609 dfa->nodes[dup_idx].duplicated = 1; 1610 1611 /* Store the index of the original node. */ 1612 dfa->org_indices[dup_idx] = org_idx; 1613 *new_idx = dup_idx; 1614 return REG_NOERROR; 1615} 1616 1617static reg_errcode_t 1618calc_inveclosure (dfa) 1619 re_dfa_t *dfa; 1620{ 1621 int src, idx, ret; 1622 for (idx = 0; idx < dfa->nodes_len; ++idx) 1623 re_node_set_init_empty (dfa->inveclosures + idx); 1624 1625 for (src = 0; src < dfa->nodes_len; ++src) 1626 { 1627 int *elems = dfa->eclosures[src].elems; 1628 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1629 { 1630 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1631 if (BE (ret == -1, 0)) 1632 return REG_ESPACE; 1633 } 1634 } 1635 1636 return REG_NOERROR; 1637} 1638 1639/* Calculate "eclosure" for all the node in DFA. */ 1640 1641static reg_errcode_t 1642calc_eclosure (dfa) 1643 re_dfa_t *dfa; 1644{ 1645 int node_idx, incomplete; 1646#ifdef DEBUG 1647 assert (dfa->nodes_len > 0); 1648#endif 1649 incomplete = 0; 1650 /* For each nodes, calculate epsilon closure. */ 1651 for (node_idx = 0; ; ++node_idx) 1652 { 1653 reg_errcode_t err; 1654 re_node_set eclosure_elem; 1655 if (node_idx == dfa->nodes_len) 1656 { 1657 if (!incomplete) 1658 break; 1659 incomplete = 0; 1660 node_idx = 0; 1661 } 1662 1663#ifdef DEBUG 1664 assert (dfa->eclosures[node_idx].nelem != -1); 1665#endif 1666 1667 /* If we have already calculated, skip it. */ 1668 if (dfa->eclosures[node_idx].nelem != 0) 1669 continue; 1670 /* Calculate epsilon closure of `node_idx'. */ 1671 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1); 1672 if (BE (err != REG_NOERROR, 0)) 1673 return err; 1674 1675 if (dfa->eclosures[node_idx].nelem == 0) 1676 { 1677 incomplete = 1; 1678 re_node_set_free (&eclosure_elem); 1679 } 1680 } 1681 return REG_NOERROR; 1682} 1683 1684/* Calculate epsilon closure of NODE. */ 1685 1686static reg_errcode_t 1687calc_eclosure_iter (new_set, dfa, node, root) 1688 re_node_set *new_set; 1689 re_dfa_t *dfa; 1690 int node, root; 1691{ 1692 reg_errcode_t err; 1693 unsigned int constraint; 1694 int i, incomplete; 1695 re_node_set eclosure; 1696 incomplete = 0; 1697 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1698 if (BE (err != REG_NOERROR, 0)) 1699 return err; 1700 1701 /* This indicates that we are calculating this node now. 1702 We reference this value to avoid infinite loop. */ 1703 dfa->eclosures[node].nelem = -1; 1704 1705 constraint = ((dfa->nodes[node].type == ANCHOR) 1706 ? dfa->nodes[node].opr.ctx_type : 0); 1707 /* If the current node has constraints, duplicate all nodes. 1708 Since they must inherit the constraints. */ 1709 if (constraint 1710 && dfa->edests[node].nelem 1711 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1712 { 1713 int org_node, cur_node; 1714 org_node = cur_node = node; 1715 err = duplicate_node_closure (dfa, node, node, node, constraint); 1716 if (BE (err != REG_NOERROR, 0)) 1717 return err; 1718 } 1719 1720 /* Expand each epsilon destination nodes. */ 1721 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1722 for (i = 0; i < dfa->edests[node].nelem; ++i) 1723 { 1724 re_node_set eclosure_elem; 1725 int edest = dfa->edests[node].elems[i]; 1726 /* If calculating the epsilon closure of `edest' is in progress, 1727 return intermediate result. */ 1728 if (dfa->eclosures[edest].nelem == -1) 1729 { 1730 incomplete = 1; 1731 continue; 1732 } 1733 /* If we haven't calculated the epsilon closure of `edest' yet, 1734 calculate now. Otherwise use calculated epsilon closure. */ 1735 if (dfa->eclosures[edest].nelem == 0) 1736 { 1737 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0); 1738 if (BE (err != REG_NOERROR, 0)) 1739 return err; 1740 } 1741 else 1742 eclosure_elem = dfa->eclosures[edest]; 1743 /* Merge the epsilon closure of `edest'. */ 1744 re_node_set_merge (&eclosure, &eclosure_elem); 1745 /* If the epsilon closure of `edest' is incomplete, 1746 the epsilon closure of this node is also incomplete. */ 1747 if (dfa->eclosures[edest].nelem == 0) 1748 { 1749 incomplete = 1; 1750 re_node_set_free (&eclosure_elem); 1751 } 1752 } 1753 1754 /* Epsilon closures include itself. */ 1755 re_node_set_insert (&eclosure, node); 1756 if (incomplete && !root) 1757 dfa->eclosures[node].nelem = 0; 1758 else 1759 dfa->eclosures[node] = eclosure; 1760 *new_set = eclosure; 1761 return REG_NOERROR; 1762} 1763 1764/* Functions for token which are used in the parser. */ 1765 1766/* Fetch a token from INPUT. 1767 We must not use this function inside bracket expressions. */ 1768 1769static void 1770fetch_token (result, input, syntax) 1771 re_token_t *result; 1772 re_string_t *input; 1773 reg_syntax_t syntax; 1774{ 1775 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1776} 1777 1778/* Peek a token from INPUT, and return the length of the token. 1779 We must not use this function inside bracket expressions. */ 1780 1781static int 1782peek_token (token, input, syntax) 1783 re_token_t *token; 1784 re_string_t *input; 1785 reg_syntax_t syntax; 1786{ 1787 unsigned char c; 1788 1789 if (re_string_eoi (input)) 1790 { 1791 token->type = END_OF_RE; 1792 return 0; 1793 } 1794 1795 c = re_string_peek_byte (input, 0); 1796 token->opr.c = c; 1797 1798 token->word_char = 0; 1799#ifdef RE_ENABLE_I18N 1800 token->mb_partial = 0; 1801 if (input->mb_cur_max > 1 && 1802 !re_string_first_byte (input, re_string_cur_idx (input))) 1803 { 1804 token->type = CHARACTER; 1805 token->mb_partial = 1; 1806 return 1; 1807 } 1808#endif 1809 if (c == '\\') 1810 { 1811 unsigned char c2; 1812 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1813 { 1814 token->type = BACK_SLASH; 1815 return 1; 1816 } 1817 1818 c2 = re_string_peek_byte_case (input, 1); 1819 token->opr.c = c2; 1820 token->type = CHARACTER; 1821#ifdef RE_ENABLE_I18N 1822 if (input->mb_cur_max > 1) 1823 { 1824 wint_t wc = re_string_wchar_at (input, 1825 re_string_cur_idx (input) + 1); 1826 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1827 } 1828 else 1829#endif 1830 token->word_char = IS_WORD_CHAR (c2) != 0; 1831 1832 switch (c2) 1833 { 1834 case '|': 1835 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1836 token->type = OP_ALT; 1837 break; 1838 case '1': case '2': case '3': case '4': case '5': 1839 case '6': case '7': case '8': case '9': 1840 if (!(syntax & RE_NO_BK_REFS)) 1841 { 1842 token->type = OP_BACK_REF; 1843 token->opr.idx = c2 - '1'; 1844 } 1845 break; 1846 case '<': 1847 if (!(syntax & RE_NO_GNU_OPS)) 1848 { 1849 token->type = ANCHOR; 1850 token->opr.ctx_type = WORD_FIRST; 1851 } 1852 break; 1853 case '>': 1854 if (!(syntax & RE_NO_GNU_OPS)) 1855 { 1856 token->type = ANCHOR; 1857 token->opr.ctx_type = WORD_LAST; 1858 } 1859 break; 1860 case 'b': 1861 if (!(syntax & RE_NO_GNU_OPS)) 1862 { 1863 token->type = ANCHOR; 1864 token->opr.ctx_type = WORD_DELIM; 1865 } 1866 break; 1867 case 'B': 1868 if (!(syntax & RE_NO_GNU_OPS)) 1869 { 1870 token->type = ANCHOR; 1871 token->opr.ctx_type = NOT_WORD_DELIM; 1872 } 1873 break; 1874 case 'w': 1875 if (!(syntax & RE_NO_GNU_OPS)) 1876 token->type = OP_WORD; 1877 break; 1878 case 'W': 1879 if (!(syntax & RE_NO_GNU_OPS)) 1880 token->type = OP_NOTWORD; 1881 break; 1882 case 's': 1883 if (!(syntax & RE_NO_GNU_OPS)) 1884 token->type = OP_SPACE; 1885 break; 1886 case 'S': 1887 if (!(syntax & RE_NO_GNU_OPS)) 1888 token->type = OP_NOTSPACE; 1889 break; 1890 case '`': 1891 if (!(syntax & RE_NO_GNU_OPS)) 1892 { 1893 token->type = ANCHOR; 1894 token->opr.ctx_type = BUF_FIRST; 1895 } 1896 break; 1897 case '\'': 1898 if (!(syntax & RE_NO_GNU_OPS)) 1899 { 1900 token->type = ANCHOR; 1901 token->opr.ctx_type = BUF_LAST; 1902 } 1903 break; 1904 case '(': 1905 if (!(syntax & RE_NO_BK_PARENS)) 1906 token->type = OP_OPEN_SUBEXP; 1907 break; 1908 case ')': 1909 if (!(syntax & RE_NO_BK_PARENS)) 1910 token->type = OP_CLOSE_SUBEXP; 1911 break; 1912 case '+': 1913 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1914 token->type = OP_DUP_PLUS; 1915 break; 1916 case '?': 1917 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1918 token->type = OP_DUP_QUESTION; 1919 break; 1920 case '{': 1921 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1922 token->type = OP_OPEN_DUP_NUM; 1923 break; 1924 case '}': 1925 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1926 token->type = OP_CLOSE_DUP_NUM; 1927 break; 1928 default: 1929 break; 1930 } 1931 return 2; 1932 } 1933 1934 token->type = CHARACTER; 1935#ifdef RE_ENABLE_I18N 1936 if (input->mb_cur_max > 1) 1937 { 1938 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1939 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1940 } 1941 else 1942#endif 1943 token->word_char = IS_WORD_CHAR (token->opr.c); 1944 1945 switch (c) 1946 { 1947 case '\n': 1948 if (syntax & RE_NEWLINE_ALT) 1949 token->type = OP_ALT; 1950 break; 1951 case '|': 1952 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1953 token->type = OP_ALT; 1954 break; 1955 case '*': 1956 token->type = OP_DUP_ASTERISK; 1957 break; 1958 case '+': 1959 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1960 token->type = OP_DUP_PLUS; 1961 break; 1962 case '?': 1963 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1964 token->type = OP_DUP_QUESTION; 1965 break; 1966 case '{': 1967 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1968 token->type = OP_OPEN_DUP_NUM; 1969 break; 1970 case '}': 1971 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1972 token->type = OP_CLOSE_DUP_NUM; 1973 break; 1974 case '(': 1975 if (syntax & RE_NO_BK_PARENS) 1976 token->type = OP_OPEN_SUBEXP; 1977 break; 1978 case ')': 1979 if (syntax & RE_NO_BK_PARENS) 1980 token->type = OP_CLOSE_SUBEXP; 1981 break; 1982 case '[': 1983 token->type = OP_OPEN_BRACKET; 1984 break; 1985 case '.': 1986 token->type = OP_PERIOD; 1987 break; 1988 case '^': 1989 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 1990 re_string_cur_idx (input) != 0) 1991 { 1992 char prev = re_string_peek_byte (input, -1); 1993 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 1994 break; 1995 } 1996 token->type = ANCHOR; 1997 token->opr.ctx_type = LINE_FIRST; 1998 break; 1999 case '$': 2000 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 2001 re_string_cur_idx (input) + 1 != re_string_length (input)) 2002 { 2003 re_token_t next; 2004 re_string_skip_bytes (input, 1); 2005 peek_token (&next, input, syntax); 2006 re_string_skip_bytes (input, -1); 2007 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 2008 break; 2009 } 2010 token->type = ANCHOR; 2011 token->opr.ctx_type = LINE_LAST; 2012 break; 2013 default: 2014 break; 2015 } 2016 return 1; 2017} 2018 2019/* Peek a token from INPUT, and return the length of the token. 2020 We must not use this function out of bracket expressions. */ 2021 2022static int 2023peek_token_bracket (token, input, syntax) 2024 re_token_t *token; 2025 re_string_t *input; 2026 reg_syntax_t syntax; 2027{ 2028 unsigned char c; 2029 if (re_string_eoi (input)) 2030 { 2031 token->type = END_OF_RE; 2032 return 0; 2033 } 2034 c = re_string_peek_byte (input, 0); 2035 token->opr.c = c; 2036 2037#ifdef RE_ENABLE_I18N 2038 if (input->mb_cur_max > 1 && 2039 !re_string_first_byte (input, re_string_cur_idx (input))) 2040 { 2041 token->type = CHARACTER; 2042 return 1; 2043 } 2044#endif /* RE_ENABLE_I18N */ 2045 2046 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2047 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2048 { 2049 /* In this case, '\' escape a character. */ 2050 unsigned char c2; 2051 re_string_skip_bytes (input, 1); 2052 c2 = re_string_peek_byte (input, 0); 2053 token->opr.c = c2; 2054 token->type = CHARACTER; 2055 return 1; 2056 } 2057 if (c == '[') /* '[' is a special char in a bracket exps. */ 2058 { 2059 unsigned char c2; 2060 int token_len; 2061 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2062 c2 = re_string_peek_byte (input, 1); 2063 else 2064 c2 = 0; 2065 token->opr.c = c2; 2066 token_len = 2; 2067 switch (c2) 2068 { 2069 case '.': 2070 token->type = OP_OPEN_COLL_ELEM; 2071 break; 2072 case '=': 2073 token->type = OP_OPEN_EQUIV_CLASS; 2074 break; 2075 case ':': 2076 if (syntax & RE_CHAR_CLASSES) 2077 { 2078 token->type = OP_OPEN_CHAR_CLASS; 2079 break; 2080 } 2081 /* else fall through. */ 2082 default: 2083 token->type = CHARACTER; 2084 token->opr.c = c; 2085 token_len = 1; 2086 break; 2087 } 2088 return token_len; 2089 } 2090 switch (c) 2091 { 2092 case '-': 2093 token->type = OP_CHARSET_RANGE; 2094 break; 2095 case ']': 2096 token->type = OP_CLOSE_BRACKET; 2097 break; 2098 case '^': 2099 token->type = OP_NON_MATCH_LIST; 2100 break; 2101 default: 2102 token->type = CHARACTER; 2103 } 2104 return 1; 2105} 2106 2107/* Functions for parser. */ 2108 2109/* Entry point of the parser. 2110 Parse the regular expression REGEXP and return the structure tree. 2111 If an error is occured, ERR is set by error code, and return NULL. 2112 This function build the following tree, from regular expression <reg_exp>: 2113 CAT 2114 / \ 2115 / \ 2116 <reg_exp> EOR 2117 2118 CAT means concatenation. 2119 EOR means end of regular expression. */ 2120 2121static bin_tree_t * 2122parse (regexp, preg, syntax, err) 2123 re_string_t *regexp; 2124 regex_t *preg; 2125 reg_syntax_t syntax; 2126 reg_errcode_t *err; 2127{ 2128 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2129 bin_tree_t *tree, *eor, *root; 2130 re_token_t current_token; 2131 dfa->syntax = syntax; 2132 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2133 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2134 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2135 return NULL; 2136 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2137 if (tree != NULL) 2138 root = create_tree (dfa, tree, eor, CONCAT); 2139 else 2140 root = eor; 2141 if (BE (eor == NULL || root == NULL, 0)) 2142 { 2143 *err = REG_ESPACE; 2144 return NULL; 2145 } 2146 return root; 2147} 2148 2149/* This function build the following tree, from regular expression 2150 <branch1>|<branch2>: 2151 ALT 2152 / \ 2153 / \ 2154 <branch1> <branch2> 2155 2156 ALT means alternative, which represents the operator `|'. */ 2157 2158static bin_tree_t * 2159parse_reg_exp (regexp, preg, token, syntax, nest, err) 2160 re_string_t *regexp; 2161 regex_t *preg; 2162 re_token_t *token; 2163 reg_syntax_t syntax; 2164 int nest; 2165 reg_errcode_t *err; 2166{ 2167 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2168 bin_tree_t *tree, *branch = NULL; 2169 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2170 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2171 return NULL; 2172 2173 while (token->type == OP_ALT) 2174 { 2175 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2176 if (token->type != OP_ALT && token->type != END_OF_RE 2177 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2178 { 2179 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2180 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2181 return NULL; 2182 } 2183 else 2184 branch = NULL; 2185 tree = create_tree (dfa, tree, branch, OP_ALT); 2186 if (BE (tree == NULL, 0)) 2187 { 2188 *err = REG_ESPACE; 2189 return NULL; 2190 } 2191 } 2192 return tree; 2193} 2194 2195/* This function build the following tree, from regular expression 2196 <exp1><exp2>: 2197 CAT 2198 / \ 2199 / \ 2200 <exp1> <exp2> 2201 2202 CAT means concatenation. */ 2203 2204static bin_tree_t * 2205parse_branch (regexp, preg, token, syntax, nest, err) 2206 re_string_t *regexp; 2207 regex_t *preg; 2208 re_token_t *token; 2209 reg_syntax_t syntax; 2210 int nest; 2211 reg_errcode_t *err; 2212{ 2213 bin_tree_t *tree, *exp; 2214 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2215 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2216 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2217 return NULL; 2218 2219 while (token->type != OP_ALT && token->type != END_OF_RE 2220 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2221 { 2222 exp = parse_expression (regexp, preg, token, syntax, nest, err); 2223 if (BE (*err != REG_NOERROR && exp == NULL, 0)) 2224 { 2225 return NULL; 2226 } 2227 if (tree != NULL && exp != NULL) 2228 { 2229 tree = create_tree (dfa, tree, exp, CONCAT); 2230 if (tree == NULL) 2231 { 2232 *err = REG_ESPACE; 2233 return NULL; 2234 } 2235 } 2236 else if (tree == NULL) 2237 tree = exp; 2238 /* Otherwise exp == NULL, we don't need to create new tree. */ 2239 } 2240 return tree; 2241} 2242 2243/* This function build the following tree, from regular expression a*: 2244 * 2245 | 2246 a 2247*/ 2248 2249static bin_tree_t * 2250parse_expression (regexp, preg, token, syntax, nest, err) 2251 re_string_t *regexp; 2252 regex_t *preg; 2253 re_token_t *token; 2254 reg_syntax_t syntax; 2255 int nest; 2256 reg_errcode_t *err; 2257{ 2258 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2259 bin_tree_t *tree; 2260 switch (token->type) 2261 { 2262 case CHARACTER: 2263 tree = create_token_tree (dfa, NULL, NULL, token); 2264 if (BE (tree == NULL, 0)) 2265 { 2266 *err = REG_ESPACE; 2267 return NULL; 2268 } 2269#ifdef RE_ENABLE_I18N 2270 if (dfa->mb_cur_max > 1) 2271 { 2272 while (!re_string_eoi (regexp) 2273 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2274 { 2275 bin_tree_t *mbc_remain; 2276 fetch_token (token, regexp, syntax); 2277 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2278 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2279 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2280 { 2281 *err = REG_ESPACE; 2282 return NULL; 2283 } 2284 } 2285 } 2286#endif 2287 break; 2288 case OP_OPEN_SUBEXP: 2289 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2290 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2291 return NULL; 2292 break; 2293 case OP_OPEN_BRACKET: 2294 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2295 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2296 return NULL; 2297 break; 2298 case OP_BACK_REF: 2299 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2300 { 2301 *err = REG_ESUBREG; 2302 return NULL; 2303 } 2304 dfa->used_bkref_map |= 1 << token->opr.idx; 2305 tree = create_token_tree (dfa, NULL, NULL, token); 2306 if (BE (tree == NULL, 0)) 2307 { 2308 *err = REG_ESPACE; 2309 return NULL; 2310 } 2311 ++dfa->nbackref; 2312 dfa->has_mb_node = 1; 2313 break; 2314 case OP_OPEN_DUP_NUM: 2315 if (syntax & RE_CONTEXT_INVALID_DUP) 2316 { 2317 *err = REG_BADRPT; 2318 return NULL; 2319 } 2320 /* FALLTHROUGH */ 2321 case OP_DUP_ASTERISK: 2322 case OP_DUP_PLUS: 2323 case OP_DUP_QUESTION: 2324 if (syntax & RE_CONTEXT_INVALID_OPS) 2325 { 2326 *err = REG_BADRPT; 2327 return NULL; 2328 } 2329 else if (syntax & RE_CONTEXT_INDEP_OPS) 2330 { 2331 fetch_token (token, regexp, syntax); 2332 return parse_expression (regexp, preg, token, syntax, nest, err); 2333 } 2334 /* else fall through */ 2335 case OP_CLOSE_SUBEXP: 2336 if ((token->type == OP_CLOSE_SUBEXP) && 2337 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2338 { 2339 *err = REG_ERPAREN; 2340 return NULL; 2341 } 2342 /* else fall through */ 2343 case OP_CLOSE_DUP_NUM: 2344 /* We treat it as a normal character. */ 2345 2346 /* Then we can these characters as normal characters. */ 2347 token->type = CHARACTER; 2348 /* mb_partial and word_char bits should be initialized already 2349 by peek_token. */ 2350 tree = create_token_tree (dfa, NULL, NULL, token); 2351 if (BE (tree == NULL, 0)) 2352 { 2353 *err = REG_ESPACE; 2354 return NULL; 2355 } 2356 break; 2357 case ANCHOR: 2358 if ((token->opr.ctx_type 2359 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2360 && dfa->word_ops_used == 0) 2361 init_word_char (dfa); 2362 if (token->opr.ctx_type == WORD_DELIM 2363 || token->opr.ctx_type == NOT_WORD_DELIM) 2364 { 2365 bin_tree_t *tree_first, *tree_last; 2366 if (token->opr.ctx_type == WORD_DELIM) 2367 { 2368 token->opr.ctx_type = WORD_FIRST; 2369 tree_first = create_token_tree (dfa, NULL, NULL, token); 2370 token->opr.ctx_type = WORD_LAST; 2371 } 2372 else 2373 { 2374 token->opr.ctx_type = INSIDE_WORD; 2375 tree_first = create_token_tree (dfa, NULL, NULL, token); 2376 token->opr.ctx_type = INSIDE_NOTWORD; 2377 } 2378 tree_last = create_token_tree (dfa, NULL, NULL, token); 2379 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2380 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2381 { 2382 *err = REG_ESPACE; 2383 return NULL; 2384 } 2385 } 2386 else 2387 { 2388 tree = create_token_tree (dfa, NULL, NULL, token); 2389 if (BE (tree == NULL, 0)) 2390 { 2391 *err = REG_ESPACE; 2392 return NULL; 2393 } 2394 } 2395 /* We must return here, since ANCHORs can't be followed 2396 by repetition operators. 2397 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2398 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2399 fetch_token (token, regexp, syntax); 2400 return tree; 2401 case OP_PERIOD: 2402 tree = create_token_tree (dfa, NULL, NULL, token); 2403 if (BE (tree == NULL, 0)) 2404 { 2405 *err = REG_ESPACE; 2406 return NULL; 2407 } 2408 if (dfa->mb_cur_max > 1) 2409 dfa->has_mb_node = 1; 2410 break; 2411 case OP_WORD: 2412 case OP_NOTWORD: 2413 tree = build_charclass_op (dfa, regexp->trans, 2414 (const unsigned char *) "alnum", 2415 (const unsigned char *) "_", 2416 token->type == OP_NOTWORD, err); 2417 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2418 return NULL; 2419 break; 2420 case OP_SPACE: 2421 case OP_NOTSPACE: 2422 tree = build_charclass_op (dfa, regexp->trans, 2423 (const unsigned char *) "space", 2424 (const unsigned char *) "", 2425 token->type == OP_NOTSPACE, err); 2426 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2427 return NULL; 2428 break; 2429 case OP_ALT: 2430 case END_OF_RE: 2431 return NULL; 2432 case BACK_SLASH: 2433 *err = REG_EESCAPE; 2434 return NULL; 2435 default: 2436 /* Must not happen? */ 2437#ifdef DEBUG 2438 assert (0); 2439#endif 2440 return NULL; 2441 } 2442 fetch_token (token, regexp, syntax); 2443 2444 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2445 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2446 { 2447 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2448 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2449 return NULL; 2450 /* In BRE consecutive duplications are not allowed. */ 2451 if ((syntax & RE_CONTEXT_INVALID_DUP) 2452 && (token->type == OP_DUP_ASTERISK 2453 || token->type == OP_OPEN_DUP_NUM)) 2454 { 2455 *err = REG_BADRPT; 2456 return NULL; 2457 } 2458 } 2459 2460 return tree; 2461} 2462 2463/* This function build the following tree, from regular expression 2464 (<reg_exp>): 2465 SUBEXP 2466 | 2467 <reg_exp> 2468*/ 2469 2470static bin_tree_t * 2471parse_sub_exp (regexp, preg, token, syntax, nest, err) 2472 re_string_t *regexp; 2473 regex_t *preg; 2474 re_token_t *token; 2475 reg_syntax_t syntax; 2476 int nest; 2477 reg_errcode_t *err; 2478{ 2479 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2480 bin_tree_t *tree; 2481 size_t cur_nsub; 2482 cur_nsub = preg->re_nsub++; 2483 2484 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2485 2486 /* The subexpression may be a null string. */ 2487 if (token->type == OP_CLOSE_SUBEXP) 2488 tree = NULL; 2489 else 2490 { 2491 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2492 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2493 *err = REG_EPAREN; 2494 if (BE (*err != REG_NOERROR, 0)) 2495 return NULL; 2496 } 2497 dfa->completed_bkref_map |= 1 << cur_nsub; 2498 2499 tree = create_tree (dfa, tree, NULL, SUBEXP); 2500 if (BE (tree == NULL, 0)) 2501 { 2502 *err = REG_ESPACE; 2503 return NULL; 2504 } 2505 tree->token.opr.idx = cur_nsub; 2506 return tree; 2507} 2508 2509/* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2510 2511static bin_tree_t * 2512parse_dup_op (elem, regexp, dfa, token, syntax, err) 2513 bin_tree_t *elem; 2514 re_string_t *regexp; 2515 re_dfa_t *dfa; 2516 re_token_t *token; 2517 reg_syntax_t syntax; 2518 reg_errcode_t *err; 2519{ 2520 bin_tree_t *tree = NULL, *old_tree = NULL; 2521 int i, start, end, start_idx = re_string_cur_idx (regexp); 2522 re_token_t start_token = *token; 2523 2524 if (token->type == OP_OPEN_DUP_NUM) 2525 { 2526 end = 0; 2527 start = fetch_number (regexp, token, syntax); 2528 if (start == -1) 2529 { 2530 if (token->type == CHARACTER && token->opr.c == ',') 2531 start = 0; /* We treat "{,m}" as "{0,m}". */ 2532 else 2533 { 2534 *err = REG_BADBR; /* <re>{} is invalid. */ 2535 return NULL; 2536 } 2537 } 2538 if (BE (start != -2, 1)) 2539 { 2540 /* We treat "{n}" as "{n,n}". */ 2541 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2542 : ((token->type == CHARACTER && token->opr.c == ',') 2543 ? fetch_number (regexp, token, syntax) : -2)); 2544 } 2545 if (BE (start == -2 || end == -2, 0)) 2546 { 2547 /* Invalid sequence. */ 2548 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2549 { 2550 if (token->type == END_OF_RE) 2551 *err = REG_EBRACE; 2552 else 2553 *err = REG_BADBR; 2554 2555 return NULL; 2556 } 2557 2558 /* If the syntax bit is set, rollback. */ 2559 re_string_set_index (regexp, start_idx); 2560 *token = start_token; 2561 token->type = CHARACTER; 2562 /* mb_partial and word_char bits should be already initialized by 2563 peek_token. */ 2564 return elem; 2565 } 2566 2567 if (BE (end != -1 && start > end, 0)) 2568 { 2569 /* First number greater than second. */ 2570 *err = REG_BADBR; 2571 return NULL; 2572 } 2573 } 2574 else 2575 { 2576 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2577 end = (token->type == OP_DUP_QUESTION) ? 1 : -1; 2578 } 2579 2580 fetch_token (token, regexp, syntax); 2581 2582 if (BE (elem == NULL, 0)) 2583 return NULL; 2584 if (BE (start == 0 && end == 0, 0)) 2585 { 2586 postorder (elem, free_tree, NULL); 2587 return NULL; 2588 } 2589 2590 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2591 if (BE (start > 0, 0)) 2592 { 2593 tree = elem; 2594 for (i = 2; i <= start; ++i) 2595 { 2596 elem = duplicate_tree (elem, dfa); 2597 tree = create_tree (dfa, tree, elem, CONCAT); 2598 if (BE (elem == NULL || tree == NULL, 0)) 2599 goto parse_dup_op_espace; 2600 } 2601 2602 if (start == end) 2603 return tree; 2604 2605 /* Duplicate ELEM before it is marked optional. */ 2606 elem = duplicate_tree (elem, dfa); 2607 old_tree = tree; 2608 } 2609 else 2610 old_tree = NULL; 2611 2612 if (elem->token.type == SUBEXP) 2613 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2614 2615 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT)); 2616 if (BE (tree == NULL, 0)) 2617 goto parse_dup_op_espace; 2618 2619 /* This loop is actually executed only when end != -1, 2620 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2621 already created the start+1-th copy. */ 2622 for (i = start + 2; i <= end; ++i) 2623 { 2624 elem = duplicate_tree (elem, dfa); 2625 tree = create_tree (dfa, tree, elem, CONCAT); 2626 if (BE (elem == NULL || tree == NULL, 0)) 2627 goto parse_dup_op_espace; 2628 2629 tree = create_tree (dfa, tree, NULL, OP_ALT); 2630 if (BE (tree == NULL, 0)) 2631 goto parse_dup_op_espace; 2632 } 2633 2634 if (old_tree) 2635 tree = create_tree (dfa, old_tree, tree, CONCAT); 2636 2637 return tree; 2638 2639 parse_dup_op_espace: 2640 *err = REG_ESPACE; 2641 return NULL; 2642} 2643 2644/* Size of the names for collating symbol/equivalence_class/character_class. 2645 I'm not sure, but maybe enough. */ 2646#define BRACKET_NAME_BUF_SIZE 32 2647 2648#ifndef _LIBC 2649 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2650 Build the range expression which starts from START_ELEM, and ends 2651 at END_ELEM. The result are written to MBCSET and SBCSET. 2652 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2653 mbcset->range_ends, is a pointer argument sinse we may 2654 update it. */ 2655 2656static reg_errcode_t 2657# ifdef RE_ENABLE_I18N 2658build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2659 re_charset_t *mbcset; 2660 int *range_alloc; 2661# else /* not RE_ENABLE_I18N */ 2662build_range_exp (sbcset, start_elem, end_elem) 2663# endif /* not RE_ENABLE_I18N */ 2664 re_bitset_ptr_t sbcset; 2665 bracket_elem_t *start_elem, *end_elem; 2666{ 2667 unsigned int start_ch, end_ch; 2668 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2669 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2670 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2671 0)) 2672 return REG_ERANGE; 2673 2674 /* We can handle no multi character collating elements without libc 2675 support. */ 2676 if (BE ((start_elem->type == COLL_SYM 2677 && strlen ((char *) start_elem->opr.name) > 1) 2678 || (end_elem->type == COLL_SYM 2679 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2680 return REG_ECOLLATE; 2681 2682# ifdef RE_ENABLE_I18N 2683 { 2684 wchar_t wc, start_wc, end_wc; 2685 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2686 2687 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2688 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2689 : 0)); 2690 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2691 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2692 : 0)); 2693 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2694 ? __btowc (start_ch) : start_elem->opr.wch); 2695 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2696 ? __btowc (end_ch) : end_elem->opr.wch); 2697 if (start_wc == WEOF || end_wc == WEOF) 2698 return REG_ECOLLATE; 2699 cmp_buf[0] = start_wc; 2700 cmp_buf[4] = end_wc; 2701 if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2702 return REG_ERANGE; 2703 2704 /* Got valid collation sequence values, add them as a new entry. 2705 However, for !_LIBC we have no collation elements: if the 2706 character set is single byte, the single byte character set 2707 that we build below suffices. parse_bracket_exp passes 2708 no MBCSET if dfa->mb_cur_max == 1. */ 2709 if (mbcset) 2710 { 2711 /* Check the space of the arrays. */ 2712 if (BE (*range_alloc == mbcset->nranges, 0)) 2713 { 2714 /* There is not enough space, need realloc. */ 2715 wchar_t *new_array_start, *new_array_end; 2716 int new_nranges; 2717 2718 /* +1 in case of mbcset->nranges is 0. */ 2719 new_nranges = 2 * mbcset->nranges + 1; 2720 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2721 are NULL if *range_alloc == 0. */ 2722 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2723 new_nranges); 2724 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2725 new_nranges); 2726 2727 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2728 return REG_ESPACE; 2729 2730 mbcset->range_starts = new_array_start; 2731 mbcset->range_ends = new_array_end; 2732 *range_alloc = new_nranges; 2733 } 2734 2735 mbcset->range_starts[mbcset->nranges] = start_wc; 2736 mbcset->range_ends[mbcset->nranges++] = end_wc; 2737 } 2738 2739 /* Build the table for single byte characters. */ 2740 for (wc = 0; wc < SBC_MAX; ++wc) 2741 { 2742 cmp_buf[2] = wc; 2743 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2744 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2745 bitset_set (sbcset, wc); 2746 } 2747 } 2748# else /* not RE_ENABLE_I18N */ 2749 { 2750 unsigned int ch; 2751 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2752 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2753 : 0)); 2754 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2755 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2756 : 0)); 2757 if (start_ch > end_ch) 2758 return REG_ERANGE; 2759 /* Build the table for single byte characters. */ 2760 for (ch = 0; ch < SBC_MAX; ++ch) 2761 if (start_ch <= ch && ch <= end_ch) 2762 bitset_set (sbcset, ch); 2763 } 2764# endif /* not RE_ENABLE_I18N */ 2765 return REG_NOERROR; 2766} 2767#endif /* not _LIBC */ 2768 2769#ifndef _LIBC 2770/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2771 Build the collating element which is represented by NAME. 2772 The result are written to MBCSET and SBCSET. 2773 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2774 pointer argument since we may update it. */ 2775 2776static reg_errcode_t 2777# ifdef RE_ENABLE_I18N 2778build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2779 re_charset_t *mbcset; 2780 int *coll_sym_alloc; 2781# else /* not RE_ENABLE_I18N */ 2782build_collating_symbol (sbcset, name) 2783# endif /* not RE_ENABLE_I18N */ 2784 re_bitset_ptr_t sbcset; 2785 const unsigned char *name; 2786{ 2787 size_t name_len = strlen ((const char *) name); 2788 if (BE (name_len != 1, 0)) 2789 return REG_ECOLLATE; 2790 else 2791 { 2792 bitset_set (sbcset, name[0]); 2793 return REG_NOERROR; 2794 } 2795} 2796#endif /* not _LIBC */ 2797 2798/* This function parse bracket expression like "[abc]", "[a-c]", 2799 "[[.a-a.]]" etc. */ 2800 2801static bin_tree_t * 2802parse_bracket_exp (regexp, dfa, token, syntax, err) 2803 re_string_t *regexp; 2804 re_dfa_t *dfa; 2805 re_token_t *token; 2806 reg_syntax_t syntax; 2807 reg_errcode_t *err; 2808{ 2809#ifdef _LIBC 2810 const unsigned char *collseqmb; 2811 const char *collseqwc; 2812 uint32_t nrules; 2813 int32_t table_size; 2814 const int32_t *symb_table; 2815 const unsigned char *extra; 2816 2817 /* Local function for parse_bracket_exp used in _LIBC environement. 2818 Seek the collating symbol entry correspondings to NAME. 2819 Return the index of the symbol in the SYMB_TABLE. */ 2820 2821 auto inline int32_t 2822 __attribute ((always_inline)) 2823 seek_collating_symbol_entry (name, name_len) 2824 const unsigned char *name; 2825 size_t name_len; 2826 { 2827 int32_t hash = elem_hash ((const char *) name, name_len); 2828 int32_t elem = hash % table_size; 2829 int32_t second = hash % (table_size - 2); 2830 while (symb_table[2 * elem] != 0) 2831 { 2832 /* First compare the hashing value. */ 2833 if (symb_table[2 * elem] == hash 2834 /* Compare the length of the name. */ 2835 && name_len == extra[symb_table[2 * elem + 1]] 2836 /* Compare the name. */ 2837 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2838 name_len) == 0) 2839 { 2840 /* Yep, this is the entry. */ 2841 break; 2842 } 2843 2844 /* Next entry. */ 2845 elem += second; 2846 } 2847 return elem; 2848 } 2849 2850 /* Local function for parse_bracket_exp used in _LIBC environement. 2851 Look up the collation sequence value of BR_ELEM. 2852 Return the value if succeeded, UINT_MAX otherwise. */ 2853 2854 auto inline unsigned int 2855 __attribute ((always_inline)) 2856 lookup_collation_sequence_value (br_elem) 2857 bracket_elem_t *br_elem; 2858 { 2859 if (br_elem->type == SB_CHAR) 2860 { 2861 /* 2862 if (MB_CUR_MAX == 1) 2863 */ 2864 if (nrules == 0) 2865 return collseqmb[br_elem->opr.ch]; 2866 else 2867 { 2868 wint_t wc = __btowc (br_elem->opr.ch); 2869 return __collseq_table_lookup (collseqwc, wc); 2870 } 2871 } 2872 else if (br_elem->type == MB_CHAR) 2873 { 2874 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2875 } 2876 else if (br_elem->type == COLL_SYM) 2877 { 2878 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2879 if (nrules != 0) 2880 { 2881 int32_t elem, idx; 2882 elem = seek_collating_symbol_entry (br_elem->opr.name, 2883 sym_name_len); 2884 if (symb_table[2 * elem] != 0) 2885 { 2886 /* We found the entry. */ 2887 idx = symb_table[2 * elem + 1]; 2888 /* Skip the name of collating element name. */ 2889 idx += 1 + extra[idx]; 2890 /* Skip the byte sequence of the collating element. */ 2891 idx += 1 + extra[idx]; 2892 /* Adjust for the alignment. */ 2893 idx = (idx + 3) & ~3; 2894 /* Skip the multibyte collation sequence value. */ 2895 idx += sizeof (unsigned int); 2896 /* Skip the wide char sequence of the collating element. */ 2897 idx += sizeof (unsigned int) * 2898 (1 + *(unsigned int *) (extra + idx)); 2899 /* Return the collation sequence value. */ 2900 return *(unsigned int *) (extra + idx); 2901 } 2902 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2903 { 2904 /* No valid character. Match it as a single byte 2905 character. */ 2906 return collseqmb[br_elem->opr.name[0]]; 2907 } 2908 } 2909 else if (sym_name_len == 1) 2910 return collseqmb[br_elem->opr.name[0]]; 2911 } 2912 return UINT_MAX; 2913 } 2914 2915 /* Local function for parse_bracket_exp used in _LIBC environement. 2916 Build the range expression which starts from START_ELEM, and ends 2917 at END_ELEM. The result are written to MBCSET and SBCSET. 2918 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2919 mbcset->range_ends, is a pointer argument sinse we may 2920 update it. */ 2921 2922 auto inline reg_errcode_t 2923 __attribute ((always_inline)) 2924 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2925 re_charset_t *mbcset; 2926 int *range_alloc; 2927 re_bitset_ptr_t sbcset; 2928 bracket_elem_t *start_elem, *end_elem; 2929 { 2930 unsigned int ch; 2931 uint32_t start_collseq; 2932 uint32_t end_collseq; 2933 2934 /* Equivalence Classes and Character Classes can't be a range 2935 start/end. */ 2936 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2937 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2938 0)) 2939 return REG_ERANGE; 2940 2941 start_collseq = lookup_collation_sequence_value (start_elem); 2942 end_collseq = lookup_collation_sequence_value (end_elem); 2943 /* Check start/end collation sequence values. */ 2944 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2945 return REG_ECOLLATE; 2946 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2947 return REG_ERANGE; 2948 2949 /* Got valid collation sequence values, add them as a new entry. 2950 However, if we have no collation elements, and the character set 2951 is single byte, the single byte character set that we 2952 build below suffices. */ 2953 if (nrules > 0 || dfa->mb_cur_max > 1) 2954 { 2955 /* Check the space of the arrays. */ 2956 if (BE (*range_alloc == mbcset->nranges, 0)) 2957 { 2958 /* There is not enough space, need realloc. */ 2959 uint32_t *new_array_start; 2960 uint32_t *new_array_end; 2961 int new_nranges; 2962 2963 /* +1 in case of mbcset->nranges is 0. */ 2964 new_nranges = 2 * mbcset->nranges + 1; 2965 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2966 new_nranges); 2967 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2968 new_nranges); 2969 2970 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2971 return REG_ESPACE; 2972 2973 mbcset->range_starts = new_array_start; 2974 mbcset->range_ends = new_array_end; 2975 *range_alloc = new_nranges; 2976 } 2977 2978 mbcset->range_starts[mbcset->nranges] = start_collseq; 2979 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2980 } 2981 2982 /* Build the table for single byte characters. */ 2983 for (ch = 0; ch < SBC_MAX; ch++) 2984 { 2985 uint32_t ch_collseq; 2986 /* 2987 if (MB_CUR_MAX == 1) 2988 */ 2989 if (nrules == 0) 2990 ch_collseq = collseqmb[ch]; 2991 else 2992 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2993 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2994 bitset_set (sbcset, ch); 2995 } 2996 return REG_NOERROR; 2997 } 2998 2999 /* Local function for parse_bracket_exp used in _LIBC environement. 3000 Build the collating element which is represented by NAME. 3001 The result are written to MBCSET and SBCSET. 3002 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 3003 pointer argument sinse we may update it. */ 3004 3005 auto inline reg_errcode_t 3006 __attribute ((always_inline)) 3007 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 3008 re_charset_t *mbcset; 3009 int *coll_sym_alloc; 3010 re_bitset_ptr_t sbcset; 3011 const unsigned char *name; 3012 { 3013 int32_t elem, idx; 3014 size_t name_len = strlen ((const char *) name); 3015 if (nrules != 0) 3016 { 3017 elem = seek_collating_symbol_entry (name, name_len); 3018 if (symb_table[2 * elem] != 0) 3019 { 3020 /* We found the entry. */ 3021 idx = symb_table[2 * elem + 1]; 3022 /* Skip the name of collating element name. */ 3023 idx += 1 + extra[idx]; 3024 } 3025 else if (symb_table[2 * elem] == 0 && name_len == 1) 3026 { 3027 /* No valid character, treat it as a normal 3028 character. */ 3029 bitset_set (sbcset, name[0]); 3030 return REG_NOERROR; 3031 } 3032 else 3033 return REG_ECOLLATE; 3034 3035 /* Got valid collation sequence, add it as a new entry. */ 3036 /* Check the space of the arrays. */ 3037 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3038 { 3039 /* Not enough, realloc it. */ 3040 /* +1 in case of mbcset->ncoll_syms is 0. */ 3041 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3042 /* Use realloc since mbcset->coll_syms is NULL 3043 if *alloc == 0. */ 3044 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3045 new_coll_sym_alloc); 3046 if (BE (new_coll_syms == NULL, 0)) 3047 return REG_ESPACE; 3048 mbcset->coll_syms = new_coll_syms; 3049 *coll_sym_alloc = new_coll_sym_alloc; 3050 } 3051 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3052 return REG_NOERROR; 3053 } 3054 else 3055 { 3056 if (BE (name_len != 1, 0)) 3057 return REG_ECOLLATE; 3058 else 3059 { 3060 bitset_set (sbcset, name[0]); 3061 return REG_NOERROR; 3062 } 3063 } 3064 } 3065#endif 3066 3067 re_token_t br_token; 3068 re_bitset_ptr_t sbcset; 3069#ifdef RE_ENABLE_I18N 3070 re_charset_t *mbcset; 3071 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3072 int equiv_class_alloc = 0, char_class_alloc = 0; 3073#endif /* not RE_ENABLE_I18N */ 3074 int non_match = 0; 3075 bin_tree_t *work_tree; 3076 int token_len; 3077 int first_round = 1; 3078#ifdef _LIBC 3079 collseqmb = (const unsigned char *) 3080 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3081 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3082 if (nrules) 3083 { 3084 /* 3085 if (MB_CUR_MAX > 1) 3086 */ 3087 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3088 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3089 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3090 _NL_COLLATE_SYMB_TABLEMB); 3091 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3092 _NL_COLLATE_SYMB_EXTRAMB); 3093 } 3094#endif 3095 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); 3096#ifdef RE_ENABLE_I18N 3097 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3098#endif /* RE_ENABLE_I18N */ 3099#ifdef RE_ENABLE_I18N 3100 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3101#else 3102 if (BE (sbcset == NULL, 0)) 3103#endif /* RE_ENABLE_I18N */ 3104 { 3105 *err = REG_ESPACE; 3106 return NULL; 3107 } 3108 3109 token_len = peek_token_bracket (token, regexp, syntax); 3110 if (BE (token->type == END_OF_RE, 0)) 3111 { 3112 *err = REG_BADPAT; 3113 goto parse_bracket_exp_free_return; 3114 } 3115 if (token->type == OP_NON_MATCH_LIST) 3116 { 3117#ifdef RE_ENABLE_I18N 3118 mbcset->non_match = 1; 3119#endif /* not RE_ENABLE_I18N */ 3120 non_match = 1; 3121 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3122 bitset_set (sbcset, '\0'); 3123 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3124 token_len = peek_token_bracket (token, regexp, syntax); 3125 if (BE (token->type == END_OF_RE, 0)) 3126 { 3127 *err = REG_BADPAT; 3128 goto parse_bracket_exp_free_return; 3129 } 3130 } 3131 3132 /* We treat the first ']' as a normal character. */ 3133 if (token->type == OP_CLOSE_BRACKET) 3134 token->type = CHARACTER; 3135 3136 while (1) 3137 { 3138 bracket_elem_t start_elem, end_elem; 3139 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3140 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3141 reg_errcode_t ret; 3142 int token_len2 = 0, is_range_exp = 0; 3143 re_token_t token2; 3144 3145 start_elem.opr.name = start_name_buf; 3146 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3147 syntax, first_round); 3148 if (BE (ret != REG_NOERROR, 0)) 3149 { 3150 *err = ret; 3151 goto parse_bracket_exp_free_return; 3152 } 3153 first_round = 0; 3154 3155 /* Get information about the next token. We need it in any case. */ 3156 token_len = peek_token_bracket (token, regexp, syntax); 3157 3158 /* Do not check for ranges if we know they are not allowed. */ 3159 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3160 { 3161 if (BE (token->type == END_OF_RE, 0)) 3162 { 3163 *err = REG_EBRACK; 3164 goto parse_bracket_exp_free_return; 3165 } 3166 if (token->type == OP_CHARSET_RANGE) 3167 { 3168 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3169 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3170 if (BE (token2.type == END_OF_RE, 0)) 3171 { 3172 *err = REG_EBRACK; 3173 goto parse_bracket_exp_free_return; 3174 } 3175 if (token2.type == OP_CLOSE_BRACKET) 3176 { 3177 /* We treat the last '-' as a normal character. */ 3178 re_string_skip_bytes (regexp, -token_len); 3179 token->type = CHARACTER; 3180 } 3181 else 3182 is_range_exp = 1; 3183 } 3184 } 3185 3186 if (is_range_exp == 1) 3187 { 3188 end_elem.opr.name = end_name_buf; 3189 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3190 dfa, syntax, 1); 3191 if (BE (ret != REG_NOERROR, 0)) 3192 { 3193 *err = ret; 3194 goto parse_bracket_exp_free_return; 3195 } 3196 3197 token_len = peek_token_bracket (token, regexp, syntax); 3198 3199#ifdef _LIBC 3200 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3201 &start_elem, &end_elem); 3202#else 3203# ifdef RE_ENABLE_I18N 3204 *err = build_range_exp (sbcset, 3205 dfa->mb_cur_max > 1 ? mbcset : NULL, 3206 &range_alloc, &start_elem, &end_elem); 3207# else 3208 *err = build_range_exp (sbcset, &start_elem, &end_elem); 3209# endif 3210#endif /* RE_ENABLE_I18N */ 3211 if (BE (*err != REG_NOERROR, 0)) 3212 goto parse_bracket_exp_free_return; 3213 } 3214 else 3215 { 3216 switch (start_elem.type) 3217 { 3218 case SB_CHAR: 3219 bitset_set (sbcset, start_elem.opr.ch); 3220 break; 3221#ifdef RE_ENABLE_I18N 3222 case MB_CHAR: 3223 /* Check whether the array has enough space. */ 3224 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3225 { 3226 wchar_t *new_mbchars; 3227 /* Not enough, realloc it. */ 3228 /* +1 in case of mbcset->nmbchars is 0. */ 3229 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3230 /* Use realloc since array is NULL if *alloc == 0. */ 3231 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3232 mbchar_alloc); 3233 if (BE (new_mbchars == NULL, 0)) 3234 goto parse_bracket_exp_espace; 3235 mbcset->mbchars = new_mbchars; 3236 } 3237 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3238 break; 3239#endif /* RE_ENABLE_I18N */ 3240 case EQUIV_CLASS: 3241 *err = build_equiv_class (sbcset, 3242#ifdef RE_ENABLE_I18N 3243 mbcset, &equiv_class_alloc, 3244#endif /* RE_ENABLE_I18N */ 3245 start_elem.opr.name); 3246 if (BE (*err != REG_NOERROR, 0)) 3247 goto parse_bracket_exp_free_return; 3248 break; 3249 case COLL_SYM: 3250 *err = build_collating_symbol (sbcset, 3251#ifdef RE_ENABLE_I18N 3252 mbcset, &coll_sym_alloc, 3253#endif /* RE_ENABLE_I18N */ 3254 start_elem.opr.name); 3255 if (BE (*err != REG_NOERROR, 0)) 3256 goto parse_bracket_exp_free_return; 3257 break; 3258 case CHAR_CLASS: 3259 *err = build_charclass (regexp->trans, sbcset, 3260#ifdef RE_ENABLE_I18N 3261 mbcset, &char_class_alloc, 3262#endif /* RE_ENABLE_I18N */ 3263 start_elem.opr.name, syntax); 3264 if (BE (*err != REG_NOERROR, 0)) 3265 goto parse_bracket_exp_free_return; 3266 break; 3267 default: 3268 assert (0); 3269 break; 3270 } 3271 } 3272 if (BE (token->type == END_OF_RE, 0)) 3273 { 3274 *err = REG_EBRACK; 3275 goto parse_bracket_exp_free_return; 3276 } 3277 if (token->type == OP_CLOSE_BRACKET) 3278 break; 3279 } 3280 3281 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3282 3283 /* If it is non-matching list. */ 3284 if (non_match) 3285 bitset_not (sbcset); 3286 3287#ifdef RE_ENABLE_I18N 3288 /* Ensure only single byte characters are set. */ 3289 if (dfa->mb_cur_max > 1) 3290 bitset_mask (sbcset, dfa->sb_char); 3291 3292 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3293 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3294 || mbcset->non_match))) 3295 { 3296 bin_tree_t *mbc_tree; 3297 int sbc_idx; 3298 /* Build a tree for complex bracket. */ 3299 dfa->has_mb_node = 1; 3300 br_token.type = COMPLEX_BRACKET; 3301 br_token.opr.mbcset = mbcset; 3302 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3303 if (BE (mbc_tree == NULL, 0)) 3304 goto parse_bracket_exp_espace; 3305 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx) 3306 if (sbcset[sbc_idx]) 3307 break; 3308 /* If there are no bits set in sbcset, there is no point 3309 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3310 if (sbc_idx < BITSET_UINTS) 3311 { 3312 /* Build a tree for simple bracket. */ 3313 br_token.type = SIMPLE_BRACKET; 3314 br_token.opr.sbcset = sbcset; 3315 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3316 if (BE (work_tree == NULL, 0)) 3317 goto parse_bracket_exp_espace; 3318 3319 /* Then join them by ALT node. */ 3320 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3321 if (BE (work_tree == NULL, 0)) 3322 goto parse_bracket_exp_espace; 3323 } 3324 else 3325 { 3326 re_free (sbcset); 3327 work_tree = mbc_tree; 3328 } 3329 } 3330 else 3331#endif /* not RE_ENABLE_I18N */ 3332 { 3333#ifdef RE_ENABLE_I18N 3334 free_charset (mbcset); 3335#endif 3336 /* Build a tree for simple bracket. */ 3337 br_token.type = SIMPLE_BRACKET; 3338 br_token.opr.sbcset = sbcset; 3339 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3340 if (BE (work_tree == NULL, 0)) 3341 goto parse_bracket_exp_espace; 3342 } 3343 return work_tree; 3344 3345 parse_bracket_exp_espace: 3346 *err = REG_ESPACE; 3347 parse_bracket_exp_free_return: 3348 re_free (sbcset); 3349#ifdef RE_ENABLE_I18N 3350 free_charset (mbcset); 3351#endif /* RE_ENABLE_I18N */ 3352 return NULL; 3353} 3354 3355/* Parse an element in the bracket expression. */ 3356 3357static reg_errcode_t 3358parse_bracket_element (elem, regexp, token, token_len, dfa, syntax, 3359 accept_hyphen) 3360 bracket_elem_t *elem; 3361 re_string_t *regexp; 3362 re_token_t *token; 3363 int token_len; 3364 re_dfa_t *dfa; 3365 reg_syntax_t syntax; 3366 int accept_hyphen; 3367{ 3368#ifdef RE_ENABLE_I18N 3369 int cur_char_size; 3370 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3371 if (cur_char_size > 1) 3372 { 3373 elem->type = MB_CHAR; 3374 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3375 re_string_skip_bytes (regexp, cur_char_size); 3376 return REG_NOERROR; 3377 } 3378#endif /* RE_ENABLE_I18N */ 3379 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3380 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3381 || token->type == OP_OPEN_EQUIV_CLASS) 3382 return parse_bracket_symbol (elem, regexp, token); 3383 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3384 { 3385 /* A '-' must only appear as anything but a range indicator before 3386 the closing bracket. Everything else is an error. */ 3387 re_token_t token2; 3388 (void) peek_token_bracket (&token2, regexp, syntax); 3389 if (token2.type != OP_CLOSE_BRACKET) 3390 /* The actual error value is not standardized since this whole 3391 case is undefined. But ERANGE makes good sense. */ 3392 return REG_ERANGE; 3393 } 3394 elem->type = SB_CHAR; 3395 elem->opr.ch = token->opr.c; 3396 return REG_NOERROR; 3397} 3398 3399/* Parse a bracket symbol in the bracket expression. Bracket symbols are 3400 such as [:<character_class>:], [.<collating_element>.], and 3401 [=<equivalent_class>=]. */ 3402 3403static reg_errcode_t 3404parse_bracket_symbol (elem, regexp, token) 3405 bracket_elem_t *elem; 3406 re_string_t *regexp; 3407 re_token_t *token; 3408{ 3409 unsigned char ch, delim = token->opr.c; 3410 int i = 0; 3411 if (re_string_eoi(regexp)) 3412 return REG_EBRACK; 3413 for (;; ++i) 3414 { 3415 if (i >= BRACKET_NAME_BUF_SIZE) 3416 return REG_EBRACK; 3417 if (token->type == OP_OPEN_CHAR_CLASS) 3418 ch = re_string_fetch_byte_case (regexp); 3419 else 3420 ch = re_string_fetch_byte (regexp); 3421 if (re_string_eoi(regexp)) 3422 return REG_EBRACK; 3423 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3424 break; 3425 elem->opr.name[i] = ch; 3426 } 3427 re_string_skip_bytes (regexp, 1); 3428 elem->opr.name[i] = '\0'; 3429 switch (token->type) 3430 { 3431 case OP_OPEN_COLL_ELEM: 3432 elem->type = COLL_SYM; 3433 break; 3434 case OP_OPEN_EQUIV_CLASS: 3435 elem->type = EQUIV_CLASS; 3436 break; 3437 case OP_OPEN_CHAR_CLASS: 3438 elem->type = CHAR_CLASS; 3439 break; 3440 default: 3441 break; 3442 } 3443 return REG_NOERROR; 3444} 3445 3446 /* Helper function for parse_bracket_exp. 3447 Build the equivalence class which is represented by NAME. 3448 The result are written to MBCSET and SBCSET. 3449 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3450 is a pointer argument sinse we may update it. */ 3451 3452static reg_errcode_t 3453#ifdef RE_ENABLE_I18N 3454build_equiv_class (sbcset, mbcset, equiv_class_alloc, name) 3455 re_charset_t *mbcset; 3456 int *equiv_class_alloc; 3457#else /* not RE_ENABLE_I18N */ 3458build_equiv_class (sbcset, name) 3459#endif /* not RE_ENABLE_I18N */ 3460 re_bitset_ptr_t sbcset; 3461 const unsigned char *name; 3462{ 3463#if defined _LIBC 3464 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3465 if (nrules != 0) 3466 { 3467 const int32_t *table, *indirect; 3468 const unsigned char *weights, *extra, *cp; 3469 unsigned char char_buf[2]; 3470 int32_t idx1, idx2; 3471 unsigned int ch; 3472 size_t len; 3473 /* This #include defines a local function! */ 3474# include <locale/weight.h> 3475 /* Calculate the index for equivalence class. */ 3476 cp = name; 3477 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3478 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3479 _NL_COLLATE_WEIGHTMB); 3480 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3481 _NL_COLLATE_EXTRAMB); 3482 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3483 _NL_COLLATE_INDIRECTMB); 3484 idx1 = findidx (&cp); 3485 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3486 /* This isn't a valid character. */ 3487 return REG_ECOLLATE; 3488 3489 /* Build single byte matcing table for this equivalence class. */ 3490 char_buf[1] = (unsigned char) '\0'; 3491 len = weights[idx1]; 3492 for (ch = 0; ch < SBC_MAX; ++ch) 3493 { 3494 char_buf[0] = ch; 3495 cp = char_buf; 3496 idx2 = findidx (&cp); 3497/* 3498 idx2 = table[ch]; 3499*/ 3500 if (idx2 == 0) 3501 /* This isn't a valid character. */ 3502 continue; 3503 if (len == weights[idx2]) 3504 { 3505 int cnt = 0; 3506 while (cnt <= len && 3507 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) 3508 ++cnt; 3509 3510 if (cnt > len) 3511 bitset_set (sbcset, ch); 3512 } 3513 } 3514 /* Check whether the array has enough space. */ 3515 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3516 { 3517 /* Not enough, realloc it. */ 3518 /* +1 in case of mbcset->nequiv_classes is 0. */ 3519 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3520 /* Use realloc since the array is NULL if *alloc == 0. */ 3521 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3522 int32_t, 3523 new_equiv_class_alloc); 3524 if (BE (new_equiv_classes == NULL, 0)) 3525 return REG_ESPACE; 3526 mbcset->equiv_classes = new_equiv_classes; 3527 *equiv_class_alloc = new_equiv_class_alloc; 3528 } 3529 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3530 } 3531 else 3532#endif /* _LIBC */ 3533 { 3534 if (BE (strlen ((const char *) name) != 1, 0)) 3535 return REG_ECOLLATE; 3536 bitset_set (sbcset, *name); 3537 } 3538 return REG_NOERROR; 3539} 3540 3541 /* Helper function for parse_bracket_exp. 3542 Build the character class which is represented by NAME. 3543 The result are written to MBCSET and SBCSET. 3544 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3545 is a pointer argument sinse we may update it. */ 3546 3547static reg_errcode_t 3548#ifdef RE_ENABLE_I18N 3549build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax) 3550 re_charset_t *mbcset; 3551 int *char_class_alloc; 3552#else /* not RE_ENABLE_I18N */ 3553build_charclass (trans, sbcset, class_name, syntax) 3554#endif /* not RE_ENABLE_I18N */ 3555 unsigned RE_TRANSLATE_TYPE trans; 3556 re_bitset_ptr_t sbcset; 3557 const unsigned char *class_name; 3558 reg_syntax_t syntax; 3559{ 3560 int i; 3561 const char *name = (const char *) class_name; 3562 3563 /* In case of REG_ICASE "upper" and "lower" match the both of 3564 upper and lower cases. */ 3565 if ((syntax & RE_ICASE) 3566 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3567 name = "alpha"; 3568 3569#ifdef RE_ENABLE_I18N 3570 /* Check the space of the arrays. */ 3571 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3572 { 3573 /* Not enough, realloc it. */ 3574 /* +1 in case of mbcset->nchar_classes is 0. */ 3575 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3576 /* Use realloc since array is NULL if *alloc == 0. */ 3577 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3578 new_char_class_alloc); 3579 if (BE (new_char_classes == NULL, 0)) 3580 return REG_ESPACE; 3581 mbcset->char_classes = new_char_classes; 3582 *char_class_alloc = new_char_class_alloc; 3583 } 3584 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3585#endif /* RE_ENABLE_I18N */ 3586 3587#define BUILD_CHARCLASS_LOOP(ctype_func) \ 3588 for (i = 0; i < SBC_MAX; ++i) \ 3589 { \ 3590 if (ctype_func (i)) \ 3591 { \ 3592 int ch = trans ? trans[i] : i; \ 3593 bitset_set (sbcset, ch); \ 3594 } \ 3595 } 3596 3597 if (strcmp (name, "alnum") == 0) 3598 BUILD_CHARCLASS_LOOP (isalnum) 3599 else if (strcmp (name, "cntrl") == 0) 3600 BUILD_CHARCLASS_LOOP (iscntrl) 3601 else if (strcmp (name, "lower") == 0) 3602 BUILD_CHARCLASS_LOOP (islower) 3603 else if (strcmp (name, "space") == 0) 3604 BUILD_CHARCLASS_LOOP (isspace) 3605 else if (strcmp (name, "alpha") == 0) 3606 BUILD_CHARCLASS_LOOP (isalpha) 3607 else if (strcmp (name, "digit") == 0) 3608 BUILD_CHARCLASS_LOOP (isdigit) 3609 else if (strcmp (name, "print") == 0) 3610 BUILD_CHARCLASS_LOOP (isprint) 3611 else if (strcmp (name, "upper") == 0) 3612 BUILD_CHARCLASS_LOOP (isupper) 3613 else if (strcmp (name, "blank") == 0) 3614 BUILD_CHARCLASS_LOOP (isblank) 3615 else if (strcmp (name, "graph") == 0) 3616 BUILD_CHARCLASS_LOOP (isgraph) 3617 else if (strcmp (name, "punct") == 0) 3618 BUILD_CHARCLASS_LOOP (ispunct) 3619 else if (strcmp (name, "xdigit") == 0) 3620 BUILD_CHARCLASS_LOOP (isxdigit) 3621 else 3622 return REG_ECTYPE; 3623 3624 return REG_NOERROR; 3625} 3626 3627static bin_tree_t * 3628build_charclass_op (dfa, trans, class_name, extra, non_match, err) 3629 re_dfa_t *dfa; 3630 unsigned RE_TRANSLATE_TYPE trans; 3631 const unsigned char *class_name; 3632 const unsigned char *extra; 3633 int non_match; 3634 reg_errcode_t *err; 3635{ 3636 re_bitset_ptr_t sbcset; 3637#ifdef RE_ENABLE_I18N 3638 re_charset_t *mbcset; 3639 int alloc = 0; 3640#endif /* not RE_ENABLE_I18N */ 3641 reg_errcode_t ret; 3642 re_token_t br_token; 3643 bin_tree_t *tree; 3644 3645 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); 3646#ifdef RE_ENABLE_I18N 3647 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3648#endif /* RE_ENABLE_I18N */ 3649 3650#ifdef RE_ENABLE_I18N 3651 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3652#else /* not RE_ENABLE_I18N */ 3653 if (BE (sbcset == NULL, 0)) 3654#endif /* not RE_ENABLE_I18N */ 3655 { 3656 *err = REG_ESPACE; 3657 return NULL; 3658 } 3659 3660 if (non_match) 3661 { 3662#ifdef RE_ENABLE_I18N 3663 /* 3664 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3665 bitset_set(cset->sbcset, '\0'); 3666 */ 3667 mbcset->non_match = 1; 3668#endif /* not RE_ENABLE_I18N */ 3669 } 3670 3671 /* We don't care the syntax in this case. */ 3672 ret = build_charclass (trans, sbcset, 3673#ifdef RE_ENABLE_I18N 3674 mbcset, &alloc, 3675#endif /* RE_ENABLE_I18N */ 3676 class_name, 0); 3677 3678 if (BE (ret != REG_NOERROR, 0)) 3679 { 3680 re_free (sbcset); 3681#ifdef RE_ENABLE_I18N 3682 free_charset (mbcset); 3683#endif /* RE_ENABLE_I18N */ 3684 *err = ret; 3685 return NULL; 3686 } 3687 /* \w match '_' also. */ 3688 for (; *extra; extra++) 3689 bitset_set (sbcset, *extra); 3690 3691 /* If it is non-matching list. */ 3692 if (non_match) 3693 bitset_not (sbcset); 3694 3695#ifdef RE_ENABLE_I18N 3696 /* Ensure only single byte characters are set. */ 3697 if (dfa->mb_cur_max > 1) 3698 bitset_mask (sbcset, dfa->sb_char); 3699#endif 3700 3701 /* Build a tree for simple bracket. */ 3702 br_token.type = SIMPLE_BRACKET; 3703 br_token.opr.sbcset = sbcset; 3704 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3705 if (BE (tree == NULL, 0)) 3706 goto build_word_op_espace; 3707 3708#ifdef RE_ENABLE_I18N 3709 if (dfa->mb_cur_max > 1) 3710 { 3711 bin_tree_t *mbc_tree; 3712 /* Build a tree for complex bracket. */ 3713 br_token.type = COMPLEX_BRACKET; 3714 br_token.opr.mbcset = mbcset; 3715 dfa->has_mb_node = 1; 3716 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3717 if (BE (mbc_tree == NULL, 0)) 3718 goto build_word_op_espace; 3719 /* Then join them by ALT node. */ 3720 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3721 if (BE (mbc_tree != NULL, 1)) 3722 return tree; 3723 } 3724 else 3725 { 3726 free_charset (mbcset); 3727 return tree; 3728 } 3729#else /* not RE_ENABLE_I18N */ 3730 return tree; 3731#endif /* not RE_ENABLE_I18N */ 3732 3733 build_word_op_espace: 3734 re_free (sbcset); 3735#ifdef RE_ENABLE_I18N 3736 free_charset (mbcset); 3737#endif /* RE_ENABLE_I18N */ 3738 *err = REG_ESPACE; 3739 return NULL; 3740} 3741 3742/* This is intended for the expressions like "a{1,3}". 3743 Fetch a number from `input', and return the number. 3744 Return -1, if the number field is empty like "{,1}". 3745 Return -2, If an error is occured. */ 3746 3747static int 3748fetch_number (input, token, syntax) 3749 re_string_t *input; 3750 re_token_t *token; 3751 reg_syntax_t syntax; 3752{ 3753 int num = -1; 3754 unsigned char c; 3755 while (1) 3756 { 3757 fetch_token (token, input, syntax); 3758 c = token->opr.c; 3759 if (BE (token->type == END_OF_RE, 0)) 3760 return -2; 3761 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3762 break; 3763 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) 3764 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0')); 3765 num = (num > RE_DUP_MAX) ? -2 : num; 3766 } 3767 return num; 3768} 3769 3770#ifdef RE_ENABLE_I18N 3771static void 3772free_charset (re_charset_t *cset) 3773{ 3774 re_free (cset->mbchars); 3775# ifdef _LIBC 3776 re_free (cset->coll_syms); 3777 re_free (cset->equiv_classes); 3778 re_free (cset->range_starts); 3779 re_free (cset->range_ends); 3780# endif 3781 re_free (cset->char_classes); 3782 re_free (cset); 3783} 3784#endif /* RE_ENABLE_I18N */ 3785 3786/* Functions for binary tree operation. */ 3787 3788/* Create a tree node. */ 3789 3790static bin_tree_t * 3791create_tree (dfa, left, right, type) 3792 re_dfa_t *dfa; 3793 bin_tree_t *left; 3794 bin_tree_t *right; 3795 re_token_type_t type; 3796{ 3797 re_token_t t; 3798 t.type = type; 3799 return create_token_tree (dfa, left, right, &t); 3800} 3801 3802static bin_tree_t * 3803create_token_tree (dfa, left, right, token) 3804 re_dfa_t *dfa; 3805 bin_tree_t *left; 3806 bin_tree_t *right; 3807 const re_token_t *token; 3808{ 3809 bin_tree_t *tree; 3810 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3811 { 3812 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3813 3814 if (storage == NULL) 3815 return NULL; 3816 storage->next = dfa->str_tree_storage; 3817 dfa->str_tree_storage = storage; 3818 dfa->str_tree_storage_idx = 0; 3819 } 3820 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3821 3822 tree->parent = NULL; 3823 tree->left = left; 3824 tree->right = right; 3825 tree->token = *token; 3826 tree->token.duplicated = 0; 3827 tree->token.opt_subexp = 0; 3828 tree->first = NULL; 3829 tree->next = NULL; 3830 tree->node_idx = -1; 3831 3832 if (left != NULL) 3833 left->parent = tree; 3834 if (right != NULL) 3835 right->parent = tree; 3836 return tree; 3837} 3838 3839/* Mark the tree SRC as an optional subexpression. 3840 To be called from preorder or postorder. */ 3841 3842static reg_errcode_t 3843mark_opt_subexp (extra, node) 3844 void *extra; 3845 bin_tree_t *node; 3846{ 3847 int idx = (int) (long) extra; 3848 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3849 node->token.opt_subexp = 1; 3850 3851 return REG_NOERROR; 3852} 3853 3854/* Free the allocated memory inside NODE. */ 3855 3856static void 3857free_token (re_token_t *node) 3858{ 3859#ifdef RE_ENABLE_I18N 3860 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3861 free_charset (node->opr.mbcset); 3862 else 3863#endif /* RE_ENABLE_I18N */ 3864 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3865 re_free (node->opr.sbcset); 3866} 3867 3868/* Worker function for tree walking. Free the allocated memory inside NODE 3869 and its children. */ 3870 3871static reg_errcode_t 3872free_tree (void *extra, bin_tree_t *node) 3873{ 3874 free_token (&node->token); 3875 return REG_NOERROR; 3876} 3877 3878 3879/* Duplicate the node SRC, and return new node. This is a preorder 3880 visit similar to the one implemented by the generic visitor, but 3881 we need more infrastructure to maintain two parallel trees --- so, 3882 it's easier to duplicate. */ 3883 3884static bin_tree_t * 3885duplicate_tree (root, dfa) 3886 const bin_tree_t *root; 3887 re_dfa_t *dfa; 3888{ 3889 const bin_tree_t *node; 3890 bin_tree_t *dup_root; 3891 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3892 3893 for (node = root; ; ) 3894 { 3895 /* Create a new tree and link it back to the current parent. */ 3896 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3897 if (*p_new == NULL) 3898 return NULL; 3899 (*p_new)->parent = dup_node; 3900 (*p_new)->token.duplicated = 1; 3901 dup_node = *p_new; 3902 3903 /* Go to the left node, or up and to the right. */ 3904 if (node->left) 3905 { 3906 node = node->left; 3907 p_new = &dup_node->left; 3908 } 3909 else 3910 { 3911 const bin_tree_t *prev = NULL; 3912 while (node->right == prev || node->right == NULL) 3913 { 3914 prev = node; 3915 node = node->parent; 3916 dup_node = dup_node->parent; 3917 if (!node) 3918 return dup_root; 3919 } 3920 node = node->right; 3921 p_new = &dup_node->right; 3922 } 3923 } 3924} 3925