1/* Extended regular expression matching and search library. 2 Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, see 18 <http://www.gnu.org/licenses/>. */ 19 20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 21 size_t length, reg_syntax_t syntax); 22static void re_compile_fastmap_iter (regex_t *bufp, 23 const re_dfastate_t *init_state, 24 char *fastmap); 25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 26#ifdef RE_ENABLE_I18N 27static void free_charset (re_charset_t *cset); 28#endif /* RE_ENABLE_I18N */ 29static void free_workarea_compile (regex_t *preg); 30static reg_errcode_t create_initial_state (re_dfa_t *dfa); 31#ifdef RE_ENABLE_I18N 32static void optimize_utf8 (re_dfa_t *dfa); 33#endif 34static reg_errcode_t analyze (regex_t *preg); 35static reg_errcode_t preorder (bin_tree_t *root, 36 reg_errcode_t (fn (void *, bin_tree_t *)), 37 void *extra); 38static reg_errcode_t postorder (bin_tree_t *root, 39 reg_errcode_t (fn (void *, bin_tree_t *)), 40 void *extra); 41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 44 bin_tree_t *node); 45static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 46static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 48static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint); 49static int search_duplicated_node (const re_dfa_t *dfa, int org_node, 50 unsigned int constraint); 51static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 53 int node, int root); 54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 55static int fetch_number (re_string_t *input, re_token_t *token, 56 reg_syntax_t syntax); 57static int peek_token (re_token_t *token, re_string_t *input, 58 reg_syntax_t syntax) internal_function; 59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 60 reg_syntax_t syntax, reg_errcode_t *err); 61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 62 re_token_t *token, reg_syntax_t syntax, 63 int nest, reg_errcode_t *err); 64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 65 re_token_t *token, reg_syntax_t syntax, 66 int nest, reg_errcode_t *err); 67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 68 re_token_t *token, reg_syntax_t syntax, 69 int nest, reg_errcode_t *err); 70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 71 re_token_t *token, reg_syntax_t syntax, 72 int nest, reg_errcode_t *err); 73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 74 re_dfa_t *dfa, re_token_t *token, 75 reg_syntax_t syntax, reg_errcode_t *err); 76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 77 re_token_t *token, reg_syntax_t syntax, 78 reg_errcode_t *err); 79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 80 re_string_t *regexp, 81 re_token_t *token, int token_len, 82 re_dfa_t *dfa, 83 reg_syntax_t syntax, 84 int accept_hyphen); 85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 86 re_string_t *regexp, 87 re_token_t *token); 88#ifdef RE_ENABLE_I18N 89static reg_errcode_t build_equiv_class (bitset_t sbcset, 90 re_charset_t *mbcset, 91 int *equiv_class_alloc, 92 const unsigned char *name); 93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 94 bitset_t sbcset, 95 re_charset_t *mbcset, 96 int *char_class_alloc, 97 const unsigned char *class_name, 98 reg_syntax_t syntax); 99#else /* not RE_ENABLE_I18N */ 100static reg_errcode_t build_equiv_class (bitset_t sbcset, 101 const unsigned char *name); 102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 103 bitset_t sbcset, 104 const unsigned char *class_name, 105 reg_syntax_t syntax); 106#endif /* not RE_ENABLE_I18N */ 107static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 108 RE_TRANSLATE_TYPE trans, 109 const unsigned char *class_name, 110 const unsigned char *extra, 111 int non_match, reg_errcode_t *err); 112static bin_tree_t *create_tree (re_dfa_t *dfa, 113 bin_tree_t *left, bin_tree_t *right, 114 re_token_type_t type); 115static bin_tree_t *create_token_tree (re_dfa_t *dfa, 116 bin_tree_t *left, bin_tree_t *right, 117 const re_token_t *token); 118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 119static void free_token (re_token_t *node); 120static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 122 123/* This table gives an error message for each of the error codes listed 124 in regex.h. Obviously the order here has to be same as there. 125 POSIX doesn't require that we do anything for REG_NOERROR, 126 but why not be nice? */ 127 128const char __re_error_msgid[] attribute_hidden = 129 { 130#define REG_NOERROR_IDX 0 131 gettext_noop ("Success") /* REG_NOERROR */ 132 "\0" 133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 134 gettext_noop ("No match") /* REG_NOMATCH */ 135 "\0" 136#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 138 "\0" 139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 141 "\0" 142#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 144 "\0" 145#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 147 "\0" 148#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 150 "\0" 151#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 153 "\0" 154#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 156 "\0" 157#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 159 "\0" 160#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 162 "\0" 163#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 164 gettext_noop ("Invalid range end") /* REG_ERANGE */ 165 "\0" 166#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 167 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 168 "\0" 169#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 171 "\0" 172#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 173 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 174 "\0" 175#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 176 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 177 "\0" 178#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 180 }; 181 182const size_t __re_error_msgid_idx[] attribute_hidden = 183 { 184 REG_NOERROR_IDX, 185 REG_NOMATCH_IDX, 186 REG_BADPAT_IDX, 187 REG_ECOLLATE_IDX, 188 REG_ECTYPE_IDX, 189 REG_EESCAPE_IDX, 190 REG_ESUBREG_IDX, 191 REG_EBRACK_IDX, 192 REG_EPAREN_IDX, 193 REG_EBRACE_IDX, 194 REG_BADBR_IDX, 195 REG_ERANGE_IDX, 196 REG_ESPACE_IDX, 197 REG_BADRPT_IDX, 198 REG_EEND_IDX, 199 REG_ESIZE_IDX, 200 REG_ERPAREN_IDX 201 }; 202 203/* Entry points for GNU code. */ 204 205/* re_compile_pattern is the GNU regular expression compiler: it 206 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 207 Returns 0 if the pattern was valid, otherwise an error string. 208 209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 210 are set in BUFP on entry. */ 211 212const char * 213re_compile_pattern (pattern, length, bufp) 214 const char *pattern; 215 size_t length; 216 struct re_pattern_buffer *bufp; 217{ 218 reg_errcode_t ret; 219 220 /* And GNU code determines whether or not to get register information 221 by passing null for the REGS argument to re_match, etc., not by 222 setting no_sub, unless RE_NO_SUB is set. */ 223 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 224 225 /* Match anchors at newline. */ 226 bufp->newline_anchor = 1; 227 228 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 229 230 if (!ret) 231 return NULL; 232 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 233} 234#ifdef _LIBC 235weak_alias (__re_compile_pattern, re_compile_pattern) 236#endif 237 238/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 239 also be assigned to arbitrarily: each pattern buffer stores its own 240 syntax, so it can be changed between regex compilations. */ 241/* This has no initializer because initialized variables in Emacs 242 become read-only after dumping. */ 243reg_syntax_t re_syntax_options; 244 245 246/* Specify the precise syntax of regexps for compilation. This provides 247 for compatibility for various utilities which historically have 248 different, incompatible syntaxes. 249 250 The argument SYNTAX is a bit mask comprised of the various bits 251 defined in regex.h. We return the old syntax. */ 252 253reg_syntax_t 254re_set_syntax (syntax) 255 reg_syntax_t syntax; 256{ 257 reg_syntax_t ret = re_syntax_options; 258 259 re_syntax_options = syntax; 260 return ret; 261} 262#ifdef _LIBC 263weak_alias (__re_set_syntax, re_set_syntax) 264#endif 265 266int 267re_compile_fastmap (bufp) 268 struct re_pattern_buffer *bufp; 269{ 270 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 271 char *fastmap = bufp->fastmap; 272 273 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 275 if (dfa->init_state != dfa->init_state_word) 276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 277 if (dfa->init_state != dfa->init_state_nl) 278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 279 if (dfa->init_state != dfa->init_state_begbuf) 280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 281 bufp->fastmap_accurate = 1; 282 return 0; 283} 284#ifdef _LIBC 285weak_alias (__re_compile_fastmap, re_compile_fastmap) 286#endif 287 288static inline void 289__attribute ((always_inline)) 290re_set_fastmap (char *fastmap, int icase, int ch) 291{ 292 fastmap[ch] = 1; 293 if (icase) 294 fastmap[tolower (ch)] = 1; 295} 296 297/* Helper function for re_compile_fastmap. 298 Compile fastmap for the initial_state INIT_STATE. */ 299 300static void 301re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 302 char *fastmap) 303{ 304 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 305 int node_cnt; 306 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 308 { 309 int node = init_state->nodes.elems[node_cnt]; 310 re_token_type_t type = dfa->nodes[node].type; 311 312 if (type == CHARACTER) 313 { 314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 315#ifdef RE_ENABLE_I18N 316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 317 { 318 unsigned char *buf = alloca (dfa->mb_cur_max), *p; 319 wchar_t wc; 320 mbstate_t state; 321 322 p = buf; 323 *p++ = dfa->nodes[node].opr.c; 324 while (++node < dfa->nodes_len 325 && dfa->nodes[node].type == CHARACTER 326 && dfa->nodes[node].mb_partial) 327 *p++ = dfa->nodes[node].opr.c; 328 memset (&state, '\0', sizeof (state)); 329 if (__mbrtowc (&wc, (const char *) buf, p - buf, 330 &state) == p - buf 331 && (__wcrtomb ((char *) buf, towlower (wc), &state) 332 != (size_t) -1)) 333 re_set_fastmap (fastmap, 0, buf[0]); 334 } 335#endif 336 } 337 else if (type == SIMPLE_BRACKET) 338 { 339 int i, ch; 340 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 341 { 342 int j; 343 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 344 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 345 if (w & ((bitset_word_t) 1 << j)) 346 re_set_fastmap (fastmap, icase, ch); 347 } 348 } 349#ifdef RE_ENABLE_I18N 350 else if (type == COMPLEX_BRACKET) 351 { 352 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 353 int i; 354 355# ifdef _LIBC 356 /* See if we have to try all bytes which start multiple collation 357 elements. 358 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 359 collation element, and don't catch 'b' since 'b' is 360 the only collation element which starts from 'b' (and 361 it is caught by SIMPLE_BRACKET). */ 362 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 363 && (cset->ncoll_syms || cset->nranges)) 364 { 365 const int32_t *table = (const int32_t *) 366 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 367 for (i = 0; i < SBC_MAX; ++i) 368 if (table[i] < 0) 369 re_set_fastmap (fastmap, icase, i); 370 } 371# endif /* _LIBC */ 372 373 /* See if we have to start the match at all multibyte characters, 374 i.e. where we would not find an invalid sequence. This only 375 applies to multibyte character sets; for single byte character 376 sets, the SIMPLE_BRACKET again suffices. */ 377 if (dfa->mb_cur_max > 1 378 && (cset->nchar_classes || cset->non_match || cset->nranges 379# ifdef _LIBC 380 || cset->nequiv_classes 381# endif /* _LIBC */ 382 )) 383 { 384 unsigned char c = 0; 385 do 386 { 387 mbstate_t mbs; 388 memset (&mbs, 0, sizeof (mbs)); 389 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 390 re_set_fastmap (fastmap, false, (int) c); 391 } 392 while (++c != 0); 393 } 394 395 else 396 { 397 /* ... Else catch all bytes which can start the mbchars. */ 398 for (i = 0; i < cset->nmbchars; ++i) 399 { 400 char buf[256]; 401 mbstate_t state; 402 memset (&state, '\0', sizeof (state)); 403 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 404 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 405 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 406 { 407 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 408 != (size_t) -1) 409 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 410 } 411 } 412 } 413 } 414#endif /* RE_ENABLE_I18N */ 415 else if (type == OP_PERIOD 416#ifdef RE_ENABLE_I18N 417 || type == OP_UTF8_PERIOD 418#endif /* RE_ENABLE_I18N */ 419 || type == END_OF_RE) 420 { 421 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 422 if (type == END_OF_RE) 423 bufp->can_be_null = 1; 424 return; 425 } 426 } 427} 428 429/* Entry point for POSIX code. */ 430/* regcomp takes a regular expression as a string and compiles it. 431 432 PREG is a regex_t *. We do not expect any fields to be initialized, 433 since POSIX says we shouldn't. Thus, we set 434 435 `buffer' to the compiled pattern; 436 `used' to the length of the compiled pattern; 437 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 438 REG_EXTENDED bit in CFLAGS is set; otherwise, to 439 RE_SYNTAX_POSIX_BASIC; 440 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 441 `fastmap' to an allocated space for the fastmap; 442 `fastmap_accurate' to zero; 443 `re_nsub' to the number of subexpressions in PATTERN. 444 445 PATTERN is the address of the pattern string. 446 447 CFLAGS is a series of bits which affect compilation. 448 449 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 450 use POSIX basic syntax. 451 452 If REG_NEWLINE is set, then . and [^...] don't match newline. 453 Also, regexec will try a match beginning after every newline. 454 455 If REG_ICASE is set, then we considers upper- and lowercase 456 versions of letters to be equivalent when matching. 457 458 If REG_NOSUB is set, then when PREG is passed to regexec, that 459 routine will report only success or failure, and nothing about the 460 registers. 461 462 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 463 the return codes and their meanings.) */ 464 465int 466regcomp (preg, pattern, cflags) 467 regex_t *__restrict preg; 468 const char *__restrict pattern; 469 int cflags; 470{ 471 reg_errcode_t ret; 472 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 473 : RE_SYNTAX_POSIX_BASIC); 474 475 preg->buffer = NULL; 476 preg->allocated = 0; 477 preg->used = 0; 478 479 /* Try to allocate space for the fastmap. */ 480 preg->fastmap = re_malloc (char, SBC_MAX); 481 if (BE (preg->fastmap == NULL, 0)) 482 return REG_ESPACE; 483 484 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 485 486 /* If REG_NEWLINE is set, newlines are treated differently. */ 487 if (cflags & REG_NEWLINE) 488 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 489 syntax &= ~RE_DOT_NEWLINE; 490 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 491 /* It also changes the matching behavior. */ 492 preg->newline_anchor = 1; 493 } 494 else 495 preg->newline_anchor = 0; 496 preg->no_sub = !!(cflags & REG_NOSUB); 497 preg->translate = NULL; 498 499 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 500 501 /* POSIX doesn't distinguish between an unmatched open-group and an 502 unmatched close-group: both are REG_EPAREN. */ 503 if (ret == REG_ERPAREN) 504 ret = REG_EPAREN; 505 506 /* We have already checked preg->fastmap != NULL. */ 507 if (BE (ret == REG_NOERROR, 1)) 508 /* Compute the fastmap now, since regexec cannot modify the pattern 509 buffer. This function never fails in this implementation. */ 510 (void) re_compile_fastmap (preg); 511 else 512 { 513 /* Some error occurred while compiling the expression. */ 514 re_free (preg->fastmap); 515 preg->fastmap = NULL; 516 } 517 518 return (int) ret; 519} 520#ifdef _LIBC 521weak_alias (__regcomp, regcomp) 522#endif 523 524/* Returns a message corresponding to an error code, ERRCODE, returned 525 from either regcomp or regexec. We don't use PREG here. */ 526 527size_t 528regerror (errcode, preg, errbuf, errbuf_size) 529 int errcode; 530 const regex_t *__restrict preg; 531 char *__restrict errbuf; 532 size_t errbuf_size; 533{ 534 const char *msg; 535 size_t msg_size; 536 537 if (BE (errcode < 0 538 || errcode >= (int) (sizeof (__re_error_msgid_idx) 539 / sizeof (__re_error_msgid_idx[0])), 0)) 540 /* Only error codes returned by the rest of the code should be passed 541 to this routine. If we are given anything else, or if other regex 542 code generates an invalid error code, then the program has a bug. 543 Dump core so we can fix it. */ 544 abort (); 545 546 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 547 548 msg_size = strlen (msg) + 1; /* Includes the null. */ 549 550 if (BE (errbuf_size != 0, 1)) 551 { 552 if (BE (msg_size > errbuf_size, 0)) 553 { 554#if defined HAVE_MEMPCPY || defined _LIBC 555 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 556#else 557 memcpy (errbuf, msg, errbuf_size - 1); 558 errbuf[errbuf_size - 1] = 0; 559#endif 560 } 561 else 562 memcpy (errbuf, msg, msg_size); 563 } 564 565 return msg_size; 566} 567#ifdef _LIBC 568weak_alias (__regerror, regerror) 569#endif 570 571 572#ifdef RE_ENABLE_I18N 573/* This static array is used for the map to single-byte characters when 574 UTF-8 is used. Otherwise we would allocate memory just to initialize 575 it the same all the time. UTF-8 is the preferred encoding so this is 576 a worthwhile optimization. */ 577static const bitset_t utf8_sb_map = 578{ 579 /* Set the first 128 bits. */ 580 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX 581}; 582#endif 583 584 585static void 586free_dfa_content (re_dfa_t *dfa) 587{ 588 int i, j; 589 590 if (dfa->nodes) 591 for (i = 0; i < dfa->nodes_len; ++i) 592 free_token (dfa->nodes + i); 593 re_free (dfa->nexts); 594 for (i = 0; i < dfa->nodes_len; ++i) 595 { 596 if (dfa->eclosures != NULL) 597 re_node_set_free (dfa->eclosures + i); 598 if (dfa->inveclosures != NULL) 599 re_node_set_free (dfa->inveclosures + i); 600 if (dfa->edests != NULL) 601 re_node_set_free (dfa->edests + i); 602 } 603 re_free (dfa->edests); 604 re_free (dfa->eclosures); 605 re_free (dfa->inveclosures); 606 re_free (dfa->nodes); 607 608 if (dfa->state_table) 609 for (i = 0; i <= dfa->state_hash_mask; ++i) 610 { 611 struct re_state_table_entry *entry = dfa->state_table + i; 612 for (j = 0; j < entry->num; ++j) 613 { 614 re_dfastate_t *state = entry->array[j]; 615 free_state (state); 616 } 617 re_free (entry->array); 618 } 619 re_free (dfa->state_table); 620#ifdef RE_ENABLE_I18N 621 if (dfa->sb_char != utf8_sb_map) 622 re_free (dfa->sb_char); 623#endif 624 re_free (dfa->subexp_map); 625#ifdef DEBUG 626 re_free (dfa->re_str); 627#endif 628 629 re_free (dfa); 630} 631 632 633/* Free dynamically allocated space used by PREG. */ 634 635void 636regfree (preg) 637 regex_t *preg; 638{ 639 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 640 if (BE (dfa != NULL, 1)) 641 free_dfa_content (dfa); 642 preg->buffer = NULL; 643 preg->allocated = 0; 644 645 re_free (preg->fastmap); 646 preg->fastmap = NULL; 647 648 re_free (preg->translate); 649 preg->translate = NULL; 650} 651#ifdef _LIBC 652weak_alias (__regfree, regfree) 653#endif 654 655/* Entry points compatible with 4.2 BSD regex library. We don't define 656 them unless specifically requested. */ 657 658#if defined _REGEX_RE_COMP || defined _LIBC 659 660/* BSD has one and only one pattern buffer. */ 661static struct re_pattern_buffer re_comp_buf; 662 663char * 664# ifdef _LIBC 665/* Make these definitions weak in libc, so POSIX programs can redefine 666 these names if they don't use our functions, and still use 667 regcomp/regexec above without link errors. */ 668weak_function 669# endif 670re_comp (s) 671 const char *s; 672{ 673 reg_errcode_t ret; 674 char *fastmap; 675 676 if (!s) 677 { 678 if (!re_comp_buf.buffer) 679 return gettext ("No previous regular expression"); 680 return 0; 681 } 682 683 if (re_comp_buf.buffer) 684 { 685 fastmap = re_comp_buf.fastmap; 686 re_comp_buf.fastmap = NULL; 687 __regfree (&re_comp_buf); 688 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 689 re_comp_buf.fastmap = fastmap; 690 } 691 692 if (re_comp_buf.fastmap == NULL) 693 { 694 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 695 if (re_comp_buf.fastmap == NULL) 696 return (char *) gettext (__re_error_msgid 697 + __re_error_msgid_idx[(int) REG_ESPACE]); 698 } 699 700 /* Since `re_exec' always passes NULL for the `regs' argument, we 701 don't need to initialize the pattern buffer fields which affect it. */ 702 703 /* Match anchors at newlines. */ 704 re_comp_buf.newline_anchor = 1; 705 706 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 707 708 if (!ret) 709 return NULL; 710 711 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 712 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 713} 714 715#ifdef _LIBC 716libc_freeres_fn (free_mem) 717{ 718 __regfree (&re_comp_buf); 719} 720#endif 721 722#endif /* _REGEX_RE_COMP */ 723 724/* Internal entry point. 725 Compile the regular expression PATTERN, whose length is LENGTH. 726 SYNTAX indicate regular expression's syntax. */ 727 728static reg_errcode_t 729re_compile_internal (regex_t *preg, const char * pattern, size_t length, 730 reg_syntax_t syntax) 731{ 732 reg_errcode_t err = REG_NOERROR; 733 re_dfa_t *dfa; 734 re_string_t regexp; 735 736 /* Initialize the pattern buffer. */ 737 preg->fastmap_accurate = 0; 738 preg->syntax = syntax; 739 preg->not_bol = preg->not_eol = 0; 740 preg->used = 0; 741 preg->re_nsub = 0; 742 preg->can_be_null = 0; 743 preg->regs_allocated = REGS_UNALLOCATED; 744 745 /* Initialize the dfa. */ 746 dfa = (re_dfa_t *) preg->buffer; 747 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 748 { 749 /* If zero allocated, but buffer is non-null, try to realloc 750 enough space. This loses if buffer's address is bogus, but 751 that is the user's responsibility. If ->buffer is NULL this 752 is a simple allocation. */ 753 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 754 if (dfa == NULL) 755 return REG_ESPACE; 756 preg->allocated = sizeof (re_dfa_t); 757 preg->buffer = (unsigned char *) dfa; 758 } 759 preg->used = sizeof (re_dfa_t); 760 761 err = init_dfa (dfa, length); 762 if (BE (err != REG_NOERROR, 0)) 763 { 764 free_dfa_content (dfa); 765 preg->buffer = NULL; 766 preg->allocated = 0; 767 return err; 768 } 769#ifdef DEBUG 770 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 771 dfa->re_str = re_malloc (char, length + 1); 772 strncpy (dfa->re_str, pattern, length + 1); 773#endif 774 775 __libc_lock_init (dfa->lock); 776 777 err = re_string_construct (®exp, pattern, length, preg->translate, 778 syntax & RE_ICASE, dfa); 779 if (BE (err != REG_NOERROR, 0)) 780 { 781 re_compile_internal_free_return: 782 free_workarea_compile (preg); 783 re_string_destruct (®exp); 784 free_dfa_content (dfa); 785 preg->buffer = NULL; 786 preg->allocated = 0; 787 return err; 788 } 789 790 /* Parse the regular expression, and build a structure tree. */ 791 preg->re_nsub = 0; 792 dfa->str_tree = parse (®exp, preg, syntax, &err); 793 if (BE (dfa->str_tree == NULL, 0)) 794 goto re_compile_internal_free_return; 795 796 /* Analyze the tree and create the nfa. */ 797 err = analyze (preg); 798 if (BE (err != REG_NOERROR, 0)) 799 goto re_compile_internal_free_return; 800 801#ifdef RE_ENABLE_I18N 802 /* If possible, do searching in single byte encoding to speed things up. */ 803 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 804 optimize_utf8 (dfa); 805#endif 806 807 /* Then create the initial state of the dfa. */ 808 err = create_initial_state (dfa); 809 810 /* Release work areas. */ 811 free_workarea_compile (preg); 812 re_string_destruct (®exp); 813 814 if (BE (err != REG_NOERROR, 0)) 815 { 816 free_dfa_content (dfa); 817 preg->buffer = NULL; 818 preg->allocated = 0; 819 } 820 821 return err; 822} 823 824/* Initialize DFA. We use the length of the regular expression PAT_LEN 825 as the initial length of some arrays. */ 826 827static reg_errcode_t 828init_dfa (re_dfa_t *dfa, size_t pat_len) 829{ 830 unsigned int table_size; 831#ifndef _LIBC 832 char *codeset_name; 833#endif 834 835 memset (dfa, '\0', sizeof (re_dfa_t)); 836 837 /* Force allocation of str_tree_storage the first time. */ 838 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 839 840 /* Avoid overflows. */ 841 if (pat_len == SIZE_MAX) 842 return REG_ESPACE; 843 844 dfa->nodes_alloc = pat_len + 1; 845 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 846 847 /* table_size = 2 ^ ceil(log pat_len) */ 848 for (table_size = 1; ; table_size <<= 1) 849 if (table_size > pat_len) 850 break; 851 852 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 853 dfa->state_hash_mask = table_size - 1; 854 855 dfa->mb_cur_max = MB_CUR_MAX; 856#ifdef _LIBC 857 if (dfa->mb_cur_max == 6 858 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 859 dfa->is_utf8 = 1; 860 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 861 != 0); 862#else 863# ifdef HAVE_LANGINFO_CODESET 864 codeset_name = nl_langinfo (CODESET); 865# else 866 codeset_name = getenv ("LC_ALL"); 867 if (codeset_name == NULL || codeset_name[0] == '\0') 868 codeset_name = getenv ("LC_CTYPE"); 869 if (codeset_name == NULL || codeset_name[0] == '\0') 870 codeset_name = getenv ("LANG"); 871 if (codeset_name == NULL) 872 codeset_name = ""; 873 else if (strchr (codeset_name, '.') != NULL) 874 codeset_name = strchr (codeset_name, '.') + 1; 875# endif 876 877 if (strcasecmp (codeset_name, "UTF-8") == 0 878 || strcasecmp (codeset_name, "UTF8") == 0) 879 dfa->is_utf8 = 1; 880 881 /* We check exhaustively in the loop below if this charset is a 882 superset of ASCII. */ 883 dfa->map_notascii = 0; 884#endif 885 886#ifdef RE_ENABLE_I18N 887 if (dfa->mb_cur_max > 1) 888 { 889 if (dfa->is_utf8) 890 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 891 else 892 { 893 int i, j, ch; 894 895 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 896 if (BE (dfa->sb_char == NULL, 0)) 897 return REG_ESPACE; 898 899 /* Set the bits corresponding to single byte chars. */ 900 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 901 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 902 { 903 wint_t wch = __btowc (ch); 904 if (wch != WEOF) 905 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 906# ifndef _LIBC 907 if (isascii (ch) && wch != ch) 908 dfa->map_notascii = 1; 909# endif 910 } 911 } 912 } 913#endif 914 915 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 916 return REG_ESPACE; 917 return REG_NOERROR; 918} 919 920/* Initialize WORD_CHAR table, which indicate which character is 921 "word". In this case "word" means that it is the word construction 922 character used by some operators like "\<", "\>", etc. */ 923 924static void 925internal_function 926init_word_char (re_dfa_t *dfa) 927{ 928 dfa->word_ops_used = 1; 929 int i = 0; 930 int ch = 0; 931 if (BE (dfa->map_notascii == 0, 1)) 932 { 933 if (sizeof (dfa->word_char[0]) == 8) 934 { 935 /* The extra temporaries here avoid "implicitly truncated" 936 warnings in the case when this is dead code, i.e. 32-bit. */ 937 const uint64_t wc0 = UINT64_C (0x03ff000000000000); 938 const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe); 939 dfa->word_char[0] = wc0; 940 dfa->word_char[1] = wc1; 941 i = 2; 942 } 943 else if (sizeof (dfa->word_char[0]) == 4) 944 { 945 dfa->word_char[0] = UINT32_C (0x00000000); 946 dfa->word_char[1] = UINT32_C (0x03ff0000); 947 dfa->word_char[2] = UINT32_C (0x87fffffe); 948 dfa->word_char[3] = UINT32_C (0x07fffffe); 949 i = 4; 950 } 951 else 952 abort (); 953 ch = 128; 954 955 if (BE (dfa->is_utf8, 1)) 956 { 957 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); 958 return; 959 } 960 } 961 962 for (; i < BITSET_WORDS; ++i) 963 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 964 if (isalnum (ch) || ch == '_') 965 dfa->word_char[i] |= (bitset_word_t) 1 << j; 966} 967 968/* Free the work area which are only used while compiling. */ 969 970static void 971free_workarea_compile (regex_t *preg) 972{ 973 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 974 bin_tree_storage_t *storage, *next; 975 for (storage = dfa->str_tree_storage; storage; storage = next) 976 { 977 next = storage->next; 978 re_free (storage); 979 } 980 dfa->str_tree_storage = NULL; 981 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 982 dfa->str_tree = NULL; 983 re_free (dfa->org_indices); 984 dfa->org_indices = NULL; 985} 986 987/* Create initial states for all contexts. */ 988 989static reg_errcode_t 990create_initial_state (re_dfa_t *dfa) 991{ 992 int first, i; 993 reg_errcode_t err; 994 re_node_set init_nodes; 995 996 /* Initial states have the epsilon closure of the node which is 997 the first node of the regular expression. */ 998 first = dfa->str_tree->first->node_idx; 999 dfa->init_node = first; 1000 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 1001 if (BE (err != REG_NOERROR, 0)) 1002 return err; 1003 1004 /* The back-references which are in initial states can epsilon transit, 1005 since in this case all of the subexpressions can be null. 1006 Then we add epsilon closures of the nodes which are the next nodes of 1007 the back-references. */ 1008 if (dfa->nbackref > 0) 1009 for (i = 0; i < init_nodes.nelem; ++i) 1010 { 1011 int node_idx = init_nodes.elems[i]; 1012 re_token_type_t type = dfa->nodes[node_idx].type; 1013 1014 int clexp_idx; 1015 if (type != OP_BACK_REF) 1016 continue; 1017 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1018 { 1019 re_token_t *clexp_node; 1020 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1021 if (clexp_node->type == OP_CLOSE_SUBEXP 1022 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1023 break; 1024 } 1025 if (clexp_idx == init_nodes.nelem) 1026 continue; 1027 1028 if (type == OP_BACK_REF) 1029 { 1030 int dest_idx = dfa->edests[node_idx].elems[0]; 1031 if (!re_node_set_contains (&init_nodes, dest_idx)) 1032 { 1033 reg_errcode_t err = re_node_set_merge (&init_nodes, 1034 dfa->eclosures 1035 + dest_idx); 1036 if (err != REG_NOERROR) 1037 return err; 1038 i = 0; 1039 } 1040 } 1041 } 1042 1043 /* It must be the first time to invoke acquire_state. */ 1044 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1045 /* We don't check ERR here, since the initial state must not be NULL. */ 1046 if (BE (dfa->init_state == NULL, 0)) 1047 return err; 1048 if (dfa->init_state->has_constraint) 1049 { 1050 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1051 CONTEXT_WORD); 1052 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1053 CONTEXT_NEWLINE); 1054 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1055 &init_nodes, 1056 CONTEXT_NEWLINE 1057 | CONTEXT_BEGBUF); 1058 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1059 || dfa->init_state_begbuf == NULL, 0)) 1060 return err; 1061 } 1062 else 1063 dfa->init_state_word = dfa->init_state_nl 1064 = dfa->init_state_begbuf = dfa->init_state; 1065 1066 re_node_set_free (&init_nodes); 1067 return REG_NOERROR; 1068} 1069 1070#ifdef RE_ENABLE_I18N 1071/* If it is possible to do searching in single byte encoding instead of UTF-8 1072 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1073 DFA nodes where needed. */ 1074 1075static void 1076optimize_utf8 (re_dfa_t *dfa) 1077{ 1078 int node, i, mb_chars = 0, has_period = 0; 1079 1080 for (node = 0; node < dfa->nodes_len; ++node) 1081 switch (dfa->nodes[node].type) 1082 { 1083 case CHARACTER: 1084 if (dfa->nodes[node].opr.c >= 0x80) 1085 mb_chars = 1; 1086 break; 1087 case ANCHOR: 1088 switch (dfa->nodes[node].opr.ctx_type) 1089 { 1090 case LINE_FIRST: 1091 case LINE_LAST: 1092 case BUF_FIRST: 1093 case BUF_LAST: 1094 break; 1095 default: 1096 /* Word anchors etc. cannot be handled. It's okay to test 1097 opr.ctx_type since constraints (for all DFA nodes) are 1098 created by ORing one or more opr.ctx_type values. */ 1099 return; 1100 } 1101 break; 1102 case OP_PERIOD: 1103 has_period = 1; 1104 break; 1105 case OP_BACK_REF: 1106 case OP_ALT: 1107 case END_OF_RE: 1108 case OP_DUP_ASTERISK: 1109 case OP_OPEN_SUBEXP: 1110 case OP_CLOSE_SUBEXP: 1111 break; 1112 case COMPLEX_BRACKET: 1113 return; 1114 case SIMPLE_BRACKET: 1115 /* Just double check. The non-ASCII range starts at 0x80. */ 1116 assert (0x80 % BITSET_WORD_BITS == 0); 1117 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1118 if (dfa->nodes[node].opr.sbcset[i]) 1119 return; 1120 break; 1121 default: 1122 abort (); 1123 } 1124 1125 if (mb_chars || has_period) 1126 for (node = 0; node < dfa->nodes_len; ++node) 1127 { 1128 if (dfa->nodes[node].type == CHARACTER 1129 && dfa->nodes[node].opr.c >= 0x80) 1130 dfa->nodes[node].mb_partial = 0; 1131 else if (dfa->nodes[node].type == OP_PERIOD) 1132 dfa->nodes[node].type = OP_UTF8_PERIOD; 1133 } 1134 1135 /* The search can be in single byte locale. */ 1136 dfa->mb_cur_max = 1; 1137 dfa->is_utf8 = 0; 1138 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1139} 1140#endif 1141 1142/* Analyze the structure tree, and calculate "first", "next", "edest", 1143 "eclosure", and "inveclosure". */ 1144 1145static reg_errcode_t 1146analyze (regex_t *preg) 1147{ 1148 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1149 reg_errcode_t ret; 1150 1151 /* Allocate arrays. */ 1152 dfa->nexts = re_malloc (int, dfa->nodes_alloc); 1153 dfa->org_indices = re_malloc (int, dfa->nodes_alloc); 1154 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1155 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1156 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1157 || dfa->eclosures == NULL, 0)) 1158 return REG_ESPACE; 1159 1160 dfa->subexp_map = re_malloc (int, preg->re_nsub); 1161 if (dfa->subexp_map != NULL) 1162 { 1163 int i; 1164 for (i = 0; i < preg->re_nsub; i++) 1165 dfa->subexp_map[i] = i; 1166 preorder (dfa->str_tree, optimize_subexps, dfa); 1167 for (i = 0; i < preg->re_nsub; i++) 1168 if (dfa->subexp_map[i] != i) 1169 break; 1170 if (i == preg->re_nsub) 1171 { 1172 free (dfa->subexp_map); 1173 dfa->subexp_map = NULL; 1174 } 1175 } 1176 1177 ret = postorder (dfa->str_tree, lower_subexps, preg); 1178 if (BE (ret != REG_NOERROR, 0)) 1179 return ret; 1180 ret = postorder (dfa->str_tree, calc_first, dfa); 1181 if (BE (ret != REG_NOERROR, 0)) 1182 return ret; 1183 preorder (dfa->str_tree, calc_next, dfa); 1184 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1185 if (BE (ret != REG_NOERROR, 0)) 1186 return ret; 1187 ret = calc_eclosure (dfa); 1188 if (BE (ret != REG_NOERROR, 0)) 1189 return ret; 1190 1191 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1192 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1193 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1194 || dfa->nbackref) 1195 { 1196 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1197 if (BE (dfa->inveclosures == NULL, 0)) 1198 return REG_ESPACE; 1199 ret = calc_inveclosure (dfa); 1200 } 1201 1202 return ret; 1203} 1204 1205/* Our parse trees are very unbalanced, so we cannot use a stack to 1206 implement parse tree visits. Instead, we use parent pointers and 1207 some hairy code in these two functions. */ 1208static reg_errcode_t 1209postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1210 void *extra) 1211{ 1212 bin_tree_t *node, *prev; 1213 1214 for (node = root; ; ) 1215 { 1216 /* Descend down the tree, preferably to the left (or to the right 1217 if that's the only child). */ 1218 while (node->left || node->right) 1219 if (node->left) 1220 node = node->left; 1221 else 1222 node = node->right; 1223 1224 do 1225 { 1226 reg_errcode_t err = fn (extra, node); 1227 if (BE (err != REG_NOERROR, 0)) 1228 return err; 1229 if (node->parent == NULL) 1230 return REG_NOERROR; 1231 prev = node; 1232 node = node->parent; 1233 } 1234 /* Go up while we have a node that is reached from the right. */ 1235 while (node->right == prev || node->right == NULL); 1236 node = node->right; 1237 } 1238} 1239 1240static reg_errcode_t 1241preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1242 void *extra) 1243{ 1244 bin_tree_t *node; 1245 1246 for (node = root; ; ) 1247 { 1248 reg_errcode_t err = fn (extra, node); 1249 if (BE (err != REG_NOERROR, 0)) 1250 return err; 1251 1252 /* Go to the left node, or up and to the right. */ 1253 if (node->left) 1254 node = node->left; 1255 else 1256 { 1257 bin_tree_t *prev = NULL; 1258 while (node->right == prev || node->right == NULL) 1259 { 1260 prev = node; 1261 node = node->parent; 1262 if (!node) 1263 return REG_NOERROR; 1264 } 1265 node = node->right; 1266 } 1267 } 1268} 1269 1270/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1271 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1272 backreferences as well. Requires a preorder visit. */ 1273static reg_errcode_t 1274optimize_subexps (void *extra, bin_tree_t *node) 1275{ 1276 re_dfa_t *dfa = (re_dfa_t *) extra; 1277 1278 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1279 { 1280 int idx = node->token.opr.idx; 1281 node->token.opr.idx = dfa->subexp_map[idx]; 1282 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1283 } 1284 1285 else if (node->token.type == SUBEXP 1286 && node->left && node->left->token.type == SUBEXP) 1287 { 1288 int other_idx = node->left->token.opr.idx; 1289 1290 node->left = node->left->left; 1291 if (node->left) 1292 node->left->parent = node; 1293 1294 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1295 if (other_idx < BITSET_WORD_BITS) 1296 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1297 } 1298 1299 return REG_NOERROR; 1300} 1301 1302/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1303 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1304static reg_errcode_t 1305lower_subexps (void *extra, bin_tree_t *node) 1306{ 1307 regex_t *preg = (regex_t *) extra; 1308 reg_errcode_t err = REG_NOERROR; 1309 1310 if (node->left && node->left->token.type == SUBEXP) 1311 { 1312 node->left = lower_subexp (&err, preg, node->left); 1313 if (node->left) 1314 node->left->parent = node; 1315 } 1316 if (node->right && node->right->token.type == SUBEXP) 1317 { 1318 node->right = lower_subexp (&err, preg, node->right); 1319 if (node->right) 1320 node->right->parent = node; 1321 } 1322 1323 return err; 1324} 1325 1326static bin_tree_t * 1327lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1328{ 1329 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1330 bin_tree_t *body = node->left; 1331 bin_tree_t *op, *cls, *tree1, *tree; 1332 1333 if (preg->no_sub 1334 /* We do not optimize empty subexpressions, because otherwise we may 1335 have bad CONCAT nodes with NULL children. This is obviously not 1336 very common, so we do not lose much. An example that triggers 1337 this case is the sed "script" /\(\)/x. */ 1338 && node->left != NULL 1339 && (node->token.opr.idx >= BITSET_WORD_BITS 1340 || !(dfa->used_bkref_map 1341 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1342 return node->left; 1343 1344 /* Convert the SUBEXP node to the concatenation of an 1345 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1346 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1347 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1348 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1349 tree = create_tree (dfa, op, tree1, CONCAT); 1350 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1351 { 1352 *err = REG_ESPACE; 1353 return NULL; 1354 } 1355 1356 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1357 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1358 return tree; 1359} 1360 1361/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1362 nodes. Requires a postorder visit. */ 1363static reg_errcode_t 1364calc_first (void *extra, bin_tree_t *node) 1365{ 1366 re_dfa_t *dfa = (re_dfa_t *) extra; 1367 if (node->token.type == CONCAT) 1368 { 1369 node->first = node->left->first; 1370 node->node_idx = node->left->node_idx; 1371 } 1372 else 1373 { 1374 node->first = node; 1375 node->node_idx = re_dfa_add_node (dfa, node->token); 1376 if (BE (node->node_idx == -1, 0)) 1377 return REG_ESPACE; 1378 if (node->token.type == ANCHOR) 1379 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1380 } 1381 return REG_NOERROR; 1382} 1383 1384/* Pass 2: compute NEXT on the tree. Preorder visit. */ 1385static reg_errcode_t 1386calc_next (void *extra, bin_tree_t *node) 1387{ 1388 switch (node->token.type) 1389 { 1390 case OP_DUP_ASTERISK: 1391 node->left->next = node; 1392 break; 1393 case CONCAT: 1394 node->left->next = node->right->first; 1395 node->right->next = node->next; 1396 break; 1397 default: 1398 if (node->left) 1399 node->left->next = node->next; 1400 if (node->right) 1401 node->right->next = node->next; 1402 break; 1403 } 1404 return REG_NOERROR; 1405} 1406 1407/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1408static reg_errcode_t 1409link_nfa_nodes (void *extra, bin_tree_t *node) 1410{ 1411 re_dfa_t *dfa = (re_dfa_t *) extra; 1412 int idx = node->node_idx; 1413 reg_errcode_t err = REG_NOERROR; 1414 1415 switch (node->token.type) 1416 { 1417 case CONCAT: 1418 break; 1419 1420 case END_OF_RE: 1421 assert (node->next == NULL); 1422 break; 1423 1424 case OP_DUP_ASTERISK: 1425 case OP_ALT: 1426 { 1427 int left, right; 1428 dfa->has_plural_match = 1; 1429 if (node->left != NULL) 1430 left = node->left->first->node_idx; 1431 else 1432 left = node->next->node_idx; 1433 if (node->right != NULL) 1434 right = node->right->first->node_idx; 1435 else 1436 right = node->next->node_idx; 1437 assert (left > -1); 1438 assert (right > -1); 1439 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1440 } 1441 break; 1442 1443 case ANCHOR: 1444 case OP_OPEN_SUBEXP: 1445 case OP_CLOSE_SUBEXP: 1446 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1447 break; 1448 1449 case OP_BACK_REF: 1450 dfa->nexts[idx] = node->next->node_idx; 1451 if (node->token.type == OP_BACK_REF) 1452 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1453 break; 1454 1455 default: 1456 assert (!IS_EPSILON_NODE (node->token.type)); 1457 dfa->nexts[idx] = node->next->node_idx; 1458 break; 1459 } 1460 1461 return err; 1462} 1463 1464/* Duplicate the epsilon closure of the node ROOT_NODE. 1465 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1466 to their own constraint. */ 1467 1468static reg_errcode_t 1469internal_function 1470duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, 1471 int root_node, unsigned int init_constraint) 1472{ 1473 int org_node, clone_node, ret; 1474 unsigned int constraint = init_constraint; 1475 for (org_node = top_org_node, clone_node = top_clone_node;;) 1476 { 1477 int org_dest, clone_dest; 1478 if (dfa->nodes[org_node].type == OP_BACK_REF) 1479 { 1480 /* If the back reference epsilon-transit, its destination must 1481 also have the constraint. Then duplicate the epsilon closure 1482 of the destination of the back reference, and store it in 1483 edests of the back reference. */ 1484 org_dest = dfa->nexts[org_node]; 1485 re_node_set_empty (dfa->edests + clone_node); 1486 clone_dest = duplicate_node (dfa, org_dest, constraint); 1487 if (BE (clone_dest == -1, 0)) 1488 return REG_ESPACE; 1489 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1490 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1491 if (BE (ret < 0, 0)) 1492 return REG_ESPACE; 1493 } 1494 else if (dfa->edests[org_node].nelem == 0) 1495 { 1496 /* In case of the node can't epsilon-transit, don't duplicate the 1497 destination and store the original destination as the 1498 destination of the node. */ 1499 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1500 break; 1501 } 1502 else if (dfa->edests[org_node].nelem == 1) 1503 { 1504 /* In case of the node can epsilon-transit, and it has only one 1505 destination. */ 1506 org_dest = dfa->edests[org_node].elems[0]; 1507 re_node_set_empty (dfa->edests + clone_node); 1508 /* If the node is root_node itself, it means the epsilon clsoure 1509 has a loop. Then tie it to the destination of the root_node. */ 1510 if (org_node == root_node && clone_node != org_node) 1511 { 1512 ret = re_node_set_insert (dfa->edests + clone_node, org_dest); 1513 if (BE (ret < 0, 0)) 1514 return REG_ESPACE; 1515 break; 1516 } 1517 /* In case of the node has another constraint, add it. */ 1518 constraint |= dfa->nodes[org_node].constraint; 1519 clone_dest = duplicate_node (dfa, org_dest, constraint); 1520 if (BE (clone_dest == -1, 0)) 1521 return REG_ESPACE; 1522 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1523 if (BE (ret < 0, 0)) 1524 return REG_ESPACE; 1525 } 1526 else /* dfa->edests[org_node].nelem == 2 */ 1527 { 1528 /* In case of the node can epsilon-transit, and it has two 1529 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1530 org_dest = dfa->edests[org_node].elems[0]; 1531 re_node_set_empty (dfa->edests + clone_node); 1532 /* Search for a duplicated node which satisfies the constraint. */ 1533 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1534 if (clone_dest == -1) 1535 { 1536 /* There is no such duplicated node, create a new one. */ 1537 reg_errcode_t err; 1538 clone_dest = duplicate_node (dfa, org_dest, constraint); 1539 if (BE (clone_dest == -1, 0)) 1540 return REG_ESPACE; 1541 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1542 if (BE (ret < 0, 0)) 1543 return REG_ESPACE; 1544 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1545 root_node, constraint); 1546 if (BE (err != REG_NOERROR, 0)) 1547 return err; 1548 } 1549 else 1550 { 1551 /* There is a duplicated node which satisfies the constraint, 1552 use it to avoid infinite loop. */ 1553 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1554 if (BE (ret < 0, 0)) 1555 return REG_ESPACE; 1556 } 1557 1558 org_dest = dfa->edests[org_node].elems[1]; 1559 clone_dest = duplicate_node (dfa, org_dest, constraint); 1560 if (BE (clone_dest == -1, 0)) 1561 return REG_ESPACE; 1562 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1563 if (BE (ret < 0, 0)) 1564 return REG_ESPACE; 1565 } 1566 org_node = org_dest; 1567 clone_node = clone_dest; 1568 } 1569 return REG_NOERROR; 1570} 1571 1572/* Search for a node which is duplicated from the node ORG_NODE, and 1573 satisfies the constraint CONSTRAINT. */ 1574 1575static int 1576search_duplicated_node (const re_dfa_t *dfa, int org_node, 1577 unsigned int constraint) 1578{ 1579 int idx; 1580 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1581 { 1582 if (org_node == dfa->org_indices[idx] 1583 && constraint == dfa->nodes[idx].constraint) 1584 return idx; /* Found. */ 1585 } 1586 return -1; /* Not found. */ 1587} 1588 1589/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1590 Return the index of the new node, or -1 if insufficient storage is 1591 available. */ 1592 1593static int 1594duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint) 1595{ 1596 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1597 if (BE (dup_idx != -1, 1)) 1598 { 1599 dfa->nodes[dup_idx].constraint = constraint; 1600 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1601 dfa->nodes[dup_idx].duplicated = 1; 1602 1603 /* Store the index of the original node. */ 1604 dfa->org_indices[dup_idx] = org_idx; 1605 } 1606 return dup_idx; 1607} 1608 1609static reg_errcode_t 1610calc_inveclosure (re_dfa_t *dfa) 1611{ 1612 int src, idx, ret; 1613 for (idx = 0; idx < dfa->nodes_len; ++idx) 1614 re_node_set_init_empty (dfa->inveclosures + idx); 1615 1616 for (src = 0; src < dfa->nodes_len; ++src) 1617 { 1618 int *elems = dfa->eclosures[src].elems; 1619 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1620 { 1621 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1622 if (BE (ret == -1, 0)) 1623 return REG_ESPACE; 1624 } 1625 } 1626 1627 return REG_NOERROR; 1628} 1629 1630/* Calculate "eclosure" for all the node in DFA. */ 1631 1632static reg_errcode_t 1633calc_eclosure (re_dfa_t *dfa) 1634{ 1635 int node_idx, incomplete; 1636#ifdef DEBUG 1637 assert (dfa->nodes_len > 0); 1638#endif 1639 incomplete = 0; 1640 /* For each nodes, calculate epsilon closure. */ 1641 for (node_idx = 0; ; ++node_idx) 1642 { 1643 reg_errcode_t err; 1644 re_node_set eclosure_elem; 1645 if (node_idx == dfa->nodes_len) 1646 { 1647 if (!incomplete) 1648 break; 1649 incomplete = 0; 1650 node_idx = 0; 1651 } 1652 1653#ifdef DEBUG 1654 assert (dfa->eclosures[node_idx].nelem != -1); 1655#endif 1656 1657 /* If we have already calculated, skip it. */ 1658 if (dfa->eclosures[node_idx].nelem != 0) 1659 continue; 1660 /* Calculate epsilon closure of `node_idx'. */ 1661 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1); 1662 if (BE (err != REG_NOERROR, 0)) 1663 return err; 1664 1665 if (dfa->eclosures[node_idx].nelem == 0) 1666 { 1667 incomplete = 1; 1668 re_node_set_free (&eclosure_elem); 1669 } 1670 } 1671 return REG_NOERROR; 1672} 1673 1674/* Calculate epsilon closure of NODE. */ 1675 1676static reg_errcode_t 1677calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root) 1678{ 1679 reg_errcode_t err; 1680 int i; 1681 re_node_set eclosure; 1682 int ret; 1683 int incomplete = 0; 1684 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1685 if (BE (err != REG_NOERROR, 0)) 1686 return err; 1687 1688 /* This indicates that we are calculating this node now. 1689 We reference this value to avoid infinite loop. */ 1690 dfa->eclosures[node].nelem = -1; 1691 1692 /* If the current node has constraints, duplicate all nodes 1693 since they must inherit the constraints. */ 1694 if (dfa->nodes[node].constraint 1695 && dfa->edests[node].nelem 1696 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1697 { 1698 err = duplicate_node_closure (dfa, node, node, node, 1699 dfa->nodes[node].constraint); 1700 if (BE (err != REG_NOERROR, 0)) 1701 return err; 1702 } 1703 1704 /* Expand each epsilon destination nodes. */ 1705 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1706 for (i = 0; i < dfa->edests[node].nelem; ++i) 1707 { 1708 re_node_set eclosure_elem; 1709 int edest = dfa->edests[node].elems[i]; 1710 /* If calculating the epsilon closure of `edest' is in progress, 1711 return intermediate result. */ 1712 if (dfa->eclosures[edest].nelem == -1) 1713 { 1714 incomplete = 1; 1715 continue; 1716 } 1717 /* If we haven't calculated the epsilon closure of `edest' yet, 1718 calculate now. Otherwise use calculated epsilon closure. */ 1719 if (dfa->eclosures[edest].nelem == 0) 1720 { 1721 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0); 1722 if (BE (err != REG_NOERROR, 0)) 1723 return err; 1724 } 1725 else 1726 eclosure_elem = dfa->eclosures[edest]; 1727 /* Merge the epsilon closure of `edest'. */ 1728 err = re_node_set_merge (&eclosure, &eclosure_elem); 1729 if (BE (err != REG_NOERROR, 0)) 1730 return err; 1731 /* If the epsilon closure of `edest' is incomplete, 1732 the epsilon closure of this node is also incomplete. */ 1733 if (dfa->eclosures[edest].nelem == 0) 1734 { 1735 incomplete = 1; 1736 re_node_set_free (&eclosure_elem); 1737 } 1738 } 1739 1740 /* An epsilon closure includes itself. */ 1741 ret = re_node_set_insert (&eclosure, node); 1742 if (BE (ret < 0, 0)) 1743 return REG_ESPACE; 1744 if (incomplete && !root) 1745 dfa->eclosures[node].nelem = 0; 1746 else 1747 dfa->eclosures[node] = eclosure; 1748 *new_set = eclosure; 1749 return REG_NOERROR; 1750} 1751 1752/* Functions for token which are used in the parser. */ 1753 1754/* Fetch a token from INPUT. 1755 We must not use this function inside bracket expressions. */ 1756 1757static void 1758internal_function 1759fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1760{ 1761 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1762} 1763 1764/* Peek a token from INPUT, and return the length of the token. 1765 We must not use this function inside bracket expressions. */ 1766 1767static int 1768internal_function 1769peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1770{ 1771 unsigned char c; 1772 1773 if (re_string_eoi (input)) 1774 { 1775 token->type = END_OF_RE; 1776 return 0; 1777 } 1778 1779 c = re_string_peek_byte (input, 0); 1780 token->opr.c = c; 1781 1782 token->word_char = 0; 1783#ifdef RE_ENABLE_I18N 1784 token->mb_partial = 0; 1785 if (input->mb_cur_max > 1 && 1786 !re_string_first_byte (input, re_string_cur_idx (input))) 1787 { 1788 token->type = CHARACTER; 1789 token->mb_partial = 1; 1790 return 1; 1791 } 1792#endif 1793 if (c == '\\') 1794 { 1795 unsigned char c2; 1796 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1797 { 1798 token->type = BACK_SLASH; 1799 return 1; 1800 } 1801 1802 c2 = re_string_peek_byte_case (input, 1); 1803 token->opr.c = c2; 1804 token->type = CHARACTER; 1805#ifdef RE_ENABLE_I18N 1806 if (input->mb_cur_max > 1) 1807 { 1808 wint_t wc = re_string_wchar_at (input, 1809 re_string_cur_idx (input) + 1); 1810 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1811 } 1812 else 1813#endif 1814 token->word_char = IS_WORD_CHAR (c2) != 0; 1815 1816 switch (c2) 1817 { 1818 case '|': 1819 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1820 token->type = OP_ALT; 1821 break; 1822 case '1': case '2': case '3': case '4': case '5': 1823 case '6': case '7': case '8': case '9': 1824 if (!(syntax & RE_NO_BK_REFS)) 1825 { 1826 token->type = OP_BACK_REF; 1827 token->opr.idx = c2 - '1'; 1828 } 1829 break; 1830 case '<': 1831 if (!(syntax & RE_NO_GNU_OPS)) 1832 { 1833 token->type = ANCHOR; 1834 token->opr.ctx_type = WORD_FIRST; 1835 } 1836 break; 1837 case '>': 1838 if (!(syntax & RE_NO_GNU_OPS)) 1839 { 1840 token->type = ANCHOR; 1841 token->opr.ctx_type = WORD_LAST; 1842 } 1843 break; 1844 case 'b': 1845 if (!(syntax & RE_NO_GNU_OPS)) 1846 { 1847 token->type = ANCHOR; 1848 token->opr.ctx_type = WORD_DELIM; 1849 } 1850 break; 1851 case 'B': 1852 if (!(syntax & RE_NO_GNU_OPS)) 1853 { 1854 token->type = ANCHOR; 1855 token->opr.ctx_type = NOT_WORD_DELIM; 1856 } 1857 break; 1858 case 'w': 1859 if (!(syntax & RE_NO_GNU_OPS)) 1860 token->type = OP_WORD; 1861 break; 1862 case 'W': 1863 if (!(syntax & RE_NO_GNU_OPS)) 1864 token->type = OP_NOTWORD; 1865 break; 1866 case 's': 1867 if (!(syntax & RE_NO_GNU_OPS)) 1868 token->type = OP_SPACE; 1869 break; 1870 case 'S': 1871 if (!(syntax & RE_NO_GNU_OPS)) 1872 token->type = OP_NOTSPACE; 1873 break; 1874 case '`': 1875 if (!(syntax & RE_NO_GNU_OPS)) 1876 { 1877 token->type = ANCHOR; 1878 token->opr.ctx_type = BUF_FIRST; 1879 } 1880 break; 1881 case '\'': 1882 if (!(syntax & RE_NO_GNU_OPS)) 1883 { 1884 token->type = ANCHOR; 1885 token->opr.ctx_type = BUF_LAST; 1886 } 1887 break; 1888 case '(': 1889 if (!(syntax & RE_NO_BK_PARENS)) 1890 token->type = OP_OPEN_SUBEXP; 1891 break; 1892 case ')': 1893 if (!(syntax & RE_NO_BK_PARENS)) 1894 token->type = OP_CLOSE_SUBEXP; 1895 break; 1896 case '+': 1897 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1898 token->type = OP_DUP_PLUS; 1899 break; 1900 case '?': 1901 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1902 token->type = OP_DUP_QUESTION; 1903 break; 1904 case '{': 1905 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1906 token->type = OP_OPEN_DUP_NUM; 1907 break; 1908 case '}': 1909 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1910 token->type = OP_CLOSE_DUP_NUM; 1911 break; 1912 default: 1913 break; 1914 } 1915 return 2; 1916 } 1917 1918 token->type = CHARACTER; 1919#ifdef RE_ENABLE_I18N 1920 if (input->mb_cur_max > 1) 1921 { 1922 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1923 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1924 } 1925 else 1926#endif 1927 token->word_char = IS_WORD_CHAR (token->opr.c); 1928 1929 switch (c) 1930 { 1931 case '\n': 1932 if (syntax & RE_NEWLINE_ALT) 1933 token->type = OP_ALT; 1934 break; 1935 case '|': 1936 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1937 token->type = OP_ALT; 1938 break; 1939 case '*': 1940 token->type = OP_DUP_ASTERISK; 1941 break; 1942 case '+': 1943 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1944 token->type = OP_DUP_PLUS; 1945 break; 1946 case '?': 1947 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1948 token->type = OP_DUP_QUESTION; 1949 break; 1950 case '{': 1951 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1952 token->type = OP_OPEN_DUP_NUM; 1953 break; 1954 case '}': 1955 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1956 token->type = OP_CLOSE_DUP_NUM; 1957 break; 1958 case '(': 1959 if (syntax & RE_NO_BK_PARENS) 1960 token->type = OP_OPEN_SUBEXP; 1961 break; 1962 case ')': 1963 if (syntax & RE_NO_BK_PARENS) 1964 token->type = OP_CLOSE_SUBEXP; 1965 break; 1966 case '[': 1967 token->type = OP_OPEN_BRACKET; 1968 break; 1969 case '.': 1970 token->type = OP_PERIOD; 1971 break; 1972 case '^': 1973 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 1974 re_string_cur_idx (input) != 0) 1975 { 1976 char prev = re_string_peek_byte (input, -1); 1977 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 1978 break; 1979 } 1980 token->type = ANCHOR; 1981 token->opr.ctx_type = LINE_FIRST; 1982 break; 1983 case '$': 1984 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1985 re_string_cur_idx (input) + 1 != re_string_length (input)) 1986 { 1987 re_token_t next; 1988 re_string_skip_bytes (input, 1); 1989 peek_token (&next, input, syntax); 1990 re_string_skip_bytes (input, -1); 1991 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1992 break; 1993 } 1994 token->type = ANCHOR; 1995 token->opr.ctx_type = LINE_LAST; 1996 break; 1997 default: 1998 break; 1999 } 2000 return 1; 2001} 2002 2003/* Peek a token from INPUT, and return the length of the token. 2004 We must not use this function out of bracket expressions. */ 2005 2006static int 2007internal_function 2008peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2009{ 2010 unsigned char c; 2011 if (re_string_eoi (input)) 2012 { 2013 token->type = END_OF_RE; 2014 return 0; 2015 } 2016 c = re_string_peek_byte (input, 0); 2017 token->opr.c = c; 2018 2019#ifdef RE_ENABLE_I18N 2020 if (input->mb_cur_max > 1 && 2021 !re_string_first_byte (input, re_string_cur_idx (input))) 2022 { 2023 token->type = CHARACTER; 2024 return 1; 2025 } 2026#endif /* RE_ENABLE_I18N */ 2027 2028 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2029 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2030 { 2031 /* In this case, '\' escape a character. */ 2032 unsigned char c2; 2033 re_string_skip_bytes (input, 1); 2034 c2 = re_string_peek_byte (input, 0); 2035 token->opr.c = c2; 2036 token->type = CHARACTER; 2037 return 1; 2038 } 2039 if (c == '[') /* '[' is a special char in a bracket exps. */ 2040 { 2041 unsigned char c2; 2042 int token_len; 2043 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2044 c2 = re_string_peek_byte (input, 1); 2045 else 2046 c2 = 0; 2047 token->opr.c = c2; 2048 token_len = 2; 2049 switch (c2) 2050 { 2051 case '.': 2052 token->type = OP_OPEN_COLL_ELEM; 2053 break; 2054 case '=': 2055 token->type = OP_OPEN_EQUIV_CLASS; 2056 break; 2057 case ':': 2058 if (syntax & RE_CHAR_CLASSES) 2059 { 2060 token->type = OP_OPEN_CHAR_CLASS; 2061 break; 2062 } 2063 /* else fall through. */ 2064 default: 2065 token->type = CHARACTER; 2066 token->opr.c = c; 2067 token_len = 1; 2068 break; 2069 } 2070 return token_len; 2071 } 2072 switch (c) 2073 { 2074 case '-': 2075 token->type = OP_CHARSET_RANGE; 2076 break; 2077 case ']': 2078 token->type = OP_CLOSE_BRACKET; 2079 break; 2080 case '^': 2081 token->type = OP_NON_MATCH_LIST; 2082 break; 2083 default: 2084 token->type = CHARACTER; 2085 } 2086 return 1; 2087} 2088 2089/* Functions for parser. */ 2090 2091/* Entry point of the parser. 2092 Parse the regular expression REGEXP and return the structure tree. 2093 If an error is occured, ERR is set by error code, and return NULL. 2094 This function build the following tree, from regular expression <reg_exp>: 2095 CAT 2096 / \ 2097 / \ 2098 <reg_exp> EOR 2099 2100 CAT means concatenation. 2101 EOR means end of regular expression. */ 2102 2103static bin_tree_t * 2104parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2105 reg_errcode_t *err) 2106{ 2107 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2108 bin_tree_t *tree, *eor, *root; 2109 re_token_t current_token; 2110 dfa->syntax = syntax; 2111 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2112 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2113 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2114 return NULL; 2115 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2116 if (tree != NULL) 2117 root = create_tree (dfa, tree, eor, CONCAT); 2118 else 2119 root = eor; 2120 if (BE (eor == NULL || root == NULL, 0)) 2121 { 2122 *err = REG_ESPACE; 2123 return NULL; 2124 } 2125 return root; 2126} 2127 2128/* This function build the following tree, from regular expression 2129 <branch1>|<branch2>: 2130 ALT 2131 / \ 2132 / \ 2133 <branch1> <branch2> 2134 2135 ALT means alternative, which represents the operator `|'. */ 2136 2137static bin_tree_t * 2138parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2139 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2140{ 2141 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2142 bin_tree_t *tree, *branch = NULL; 2143 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2144 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2145 return NULL; 2146 2147 while (token->type == OP_ALT) 2148 { 2149 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2150 if (token->type != OP_ALT && token->type != END_OF_RE 2151 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2152 { 2153 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2154 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2155 return NULL; 2156 } 2157 else 2158 branch = NULL; 2159 tree = create_tree (dfa, tree, branch, OP_ALT); 2160 if (BE (tree == NULL, 0)) 2161 { 2162 *err = REG_ESPACE; 2163 return NULL; 2164 } 2165 } 2166 return tree; 2167} 2168 2169/* This function build the following tree, from regular expression 2170 <exp1><exp2>: 2171 CAT 2172 / \ 2173 / \ 2174 <exp1> <exp2> 2175 2176 CAT means concatenation. */ 2177 2178static bin_tree_t * 2179parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2180 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2181{ 2182 bin_tree_t *tree, *exp; 2183 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2184 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2185 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2186 return NULL; 2187 2188 while (token->type != OP_ALT && token->type != END_OF_RE 2189 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2190 { 2191 exp = parse_expression (regexp, preg, token, syntax, nest, err); 2192 if (BE (*err != REG_NOERROR && exp == NULL, 0)) 2193 { 2194 if (tree != NULL) 2195 postorder (tree, free_tree, NULL); 2196 return NULL; 2197 } 2198 if (tree != NULL && exp != NULL) 2199 { 2200 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT); 2201 if (newtree == NULL) 2202 { 2203 postorder (exp, free_tree, NULL); 2204 postorder (tree, free_tree, NULL); 2205 *err = REG_ESPACE; 2206 return NULL; 2207 } 2208 tree = newtree; 2209 } 2210 else if (tree == NULL) 2211 tree = exp; 2212 /* Otherwise exp == NULL, we don't need to create new tree. */ 2213 } 2214 return tree; 2215} 2216 2217/* This function build the following tree, from regular expression a*: 2218 * 2219 | 2220 a 2221*/ 2222 2223static bin_tree_t * 2224parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2225 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2226{ 2227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2228 bin_tree_t *tree; 2229 switch (token->type) 2230 { 2231 case CHARACTER: 2232 tree = create_token_tree (dfa, NULL, NULL, token); 2233 if (BE (tree == NULL, 0)) 2234 { 2235 *err = REG_ESPACE; 2236 return NULL; 2237 } 2238#ifdef RE_ENABLE_I18N 2239 if (dfa->mb_cur_max > 1) 2240 { 2241 while (!re_string_eoi (regexp) 2242 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2243 { 2244 bin_tree_t *mbc_remain; 2245 fetch_token (token, regexp, syntax); 2246 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2247 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2248 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2249 { 2250 *err = REG_ESPACE; 2251 return NULL; 2252 } 2253 } 2254 } 2255#endif 2256 break; 2257 case OP_OPEN_SUBEXP: 2258 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2259 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2260 return NULL; 2261 break; 2262 case OP_OPEN_BRACKET: 2263 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2264 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2265 return NULL; 2266 break; 2267 case OP_BACK_REF: 2268 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2269 { 2270 *err = REG_ESUBREG; 2271 return NULL; 2272 } 2273 dfa->used_bkref_map |= 1 << token->opr.idx; 2274 tree = create_token_tree (dfa, NULL, NULL, token); 2275 if (BE (tree == NULL, 0)) 2276 { 2277 *err = REG_ESPACE; 2278 return NULL; 2279 } 2280 ++dfa->nbackref; 2281 dfa->has_mb_node = 1; 2282 break; 2283 case OP_OPEN_DUP_NUM: 2284 if (syntax & RE_CONTEXT_INVALID_DUP) 2285 { 2286 *err = REG_BADRPT; 2287 return NULL; 2288 } 2289 /* FALLTHROUGH */ 2290 case OP_DUP_ASTERISK: 2291 case OP_DUP_PLUS: 2292 case OP_DUP_QUESTION: 2293 if (syntax & RE_CONTEXT_INVALID_OPS) 2294 { 2295 *err = REG_BADRPT; 2296 return NULL; 2297 } 2298 else if (syntax & RE_CONTEXT_INDEP_OPS) 2299 { 2300 fetch_token (token, regexp, syntax); 2301 return parse_expression (regexp, preg, token, syntax, nest, err); 2302 } 2303 /* else fall through */ 2304 case OP_CLOSE_SUBEXP: 2305 if ((token->type == OP_CLOSE_SUBEXP) && 2306 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2307 { 2308 *err = REG_ERPAREN; 2309 return NULL; 2310 } 2311 /* else fall through */ 2312 case OP_CLOSE_DUP_NUM: 2313 /* We treat it as a normal character. */ 2314 2315 /* Then we can these characters as normal characters. */ 2316 token->type = CHARACTER; 2317 /* mb_partial and word_char bits should be initialized already 2318 by peek_token. */ 2319 tree = create_token_tree (dfa, NULL, NULL, token); 2320 if (BE (tree == NULL, 0)) 2321 { 2322 *err = REG_ESPACE; 2323 return NULL; 2324 } 2325 break; 2326 case ANCHOR: 2327 if ((token->opr.ctx_type 2328 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2329 && dfa->word_ops_used == 0) 2330 init_word_char (dfa); 2331 if (token->opr.ctx_type == WORD_DELIM 2332 || token->opr.ctx_type == NOT_WORD_DELIM) 2333 { 2334 bin_tree_t *tree_first, *tree_last; 2335 if (token->opr.ctx_type == WORD_DELIM) 2336 { 2337 token->opr.ctx_type = WORD_FIRST; 2338 tree_first = create_token_tree (dfa, NULL, NULL, token); 2339 token->opr.ctx_type = WORD_LAST; 2340 } 2341 else 2342 { 2343 token->opr.ctx_type = INSIDE_WORD; 2344 tree_first = create_token_tree (dfa, NULL, NULL, token); 2345 token->opr.ctx_type = INSIDE_NOTWORD; 2346 } 2347 tree_last = create_token_tree (dfa, NULL, NULL, token); 2348 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2349 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2350 { 2351 *err = REG_ESPACE; 2352 return NULL; 2353 } 2354 } 2355 else 2356 { 2357 tree = create_token_tree (dfa, NULL, NULL, token); 2358 if (BE (tree == NULL, 0)) 2359 { 2360 *err = REG_ESPACE; 2361 return NULL; 2362 } 2363 } 2364 /* We must return here, since ANCHORs can't be followed 2365 by repetition operators. 2366 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2367 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2368 fetch_token (token, regexp, syntax); 2369 return tree; 2370 case OP_PERIOD: 2371 tree = create_token_tree (dfa, NULL, NULL, token); 2372 if (BE (tree == NULL, 0)) 2373 { 2374 *err = REG_ESPACE; 2375 return NULL; 2376 } 2377 if (dfa->mb_cur_max > 1) 2378 dfa->has_mb_node = 1; 2379 break; 2380 case OP_WORD: 2381 case OP_NOTWORD: 2382 tree = build_charclass_op (dfa, regexp->trans, 2383 (const unsigned char *) "alnum", 2384 (const unsigned char *) "_", 2385 token->type == OP_NOTWORD, err); 2386 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2387 return NULL; 2388 break; 2389 case OP_SPACE: 2390 case OP_NOTSPACE: 2391 tree = build_charclass_op (dfa, regexp->trans, 2392 (const unsigned char *) "space", 2393 (const unsigned char *) "", 2394 token->type == OP_NOTSPACE, err); 2395 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2396 return NULL; 2397 break; 2398 case OP_ALT: 2399 case END_OF_RE: 2400 return NULL; 2401 case BACK_SLASH: 2402 *err = REG_EESCAPE; 2403 return NULL; 2404 default: 2405 /* Must not happen? */ 2406#ifdef DEBUG 2407 assert (0); 2408#endif 2409 return NULL; 2410 } 2411 fetch_token (token, regexp, syntax); 2412 2413 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2414 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2415 { 2416 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2417 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2418 return NULL; 2419 /* In BRE consecutive duplications are not allowed. */ 2420 if ((syntax & RE_CONTEXT_INVALID_DUP) 2421 && (token->type == OP_DUP_ASTERISK 2422 || token->type == OP_OPEN_DUP_NUM)) 2423 { 2424 *err = REG_BADRPT; 2425 return NULL; 2426 } 2427 } 2428 2429 return tree; 2430} 2431 2432/* This function build the following tree, from regular expression 2433 (<reg_exp>): 2434 SUBEXP 2435 | 2436 <reg_exp> 2437*/ 2438 2439static bin_tree_t * 2440parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2441 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2442{ 2443 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2444 bin_tree_t *tree; 2445 size_t cur_nsub; 2446 cur_nsub = preg->re_nsub++; 2447 2448 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2449 2450 /* The subexpression may be a null string. */ 2451 if (token->type == OP_CLOSE_SUBEXP) 2452 tree = NULL; 2453 else 2454 { 2455 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2456 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2457 { 2458 if (tree != NULL) 2459 postorder (tree, free_tree, NULL); 2460 *err = REG_EPAREN; 2461 } 2462 if (BE (*err != REG_NOERROR, 0)) 2463 return NULL; 2464 } 2465 2466 if (cur_nsub <= '9' - '1') 2467 dfa->completed_bkref_map |= 1 << cur_nsub; 2468 2469 tree = create_tree (dfa, tree, NULL, SUBEXP); 2470 if (BE (tree == NULL, 0)) 2471 { 2472 *err = REG_ESPACE; 2473 return NULL; 2474 } 2475 tree->token.opr.idx = cur_nsub; 2476 return tree; 2477} 2478 2479/* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2480 2481static bin_tree_t * 2482parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2483 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2484{ 2485 bin_tree_t *tree = NULL, *old_tree = NULL; 2486 int i, start, end, start_idx = re_string_cur_idx (regexp); 2487 re_token_t start_token = *token; 2488 2489 if (token->type == OP_OPEN_DUP_NUM) 2490 { 2491 end = 0; 2492 start = fetch_number (regexp, token, syntax); 2493 if (start == -1) 2494 { 2495 if (token->type == CHARACTER && token->opr.c == ',') 2496 start = 0; /* We treat "{,m}" as "{0,m}". */ 2497 else 2498 { 2499 *err = REG_BADBR; /* <re>{} is invalid. */ 2500 return NULL; 2501 } 2502 } 2503 if (BE (start != -2, 1)) 2504 { 2505 /* We treat "{n}" as "{n,n}". */ 2506 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2507 : ((token->type == CHARACTER && token->opr.c == ',') 2508 ? fetch_number (regexp, token, syntax) : -2)); 2509 } 2510 if (BE (start == -2 || end == -2, 0)) 2511 { 2512 /* Invalid sequence. */ 2513 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2514 { 2515 if (token->type == END_OF_RE) 2516 *err = REG_EBRACE; 2517 else 2518 *err = REG_BADBR; 2519 2520 return NULL; 2521 } 2522 2523 /* If the syntax bit is set, rollback. */ 2524 re_string_set_index (regexp, start_idx); 2525 *token = start_token; 2526 token->type = CHARACTER; 2527 /* mb_partial and word_char bits should be already initialized by 2528 peek_token. */ 2529 return elem; 2530 } 2531 2532 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0)) 2533 { 2534 /* First number greater than second. */ 2535 *err = REG_BADBR; 2536 return NULL; 2537 } 2538 } 2539 else 2540 { 2541 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2542 end = (token->type == OP_DUP_QUESTION) ? 1 : -1; 2543 } 2544 2545 fetch_token (token, regexp, syntax); 2546 2547 if (BE (elem == NULL, 0)) 2548 return NULL; 2549 if (BE (start == 0 && end == 0, 0)) 2550 { 2551 postorder (elem, free_tree, NULL); 2552 return NULL; 2553 } 2554 2555 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2556 if (BE (start > 0, 0)) 2557 { 2558 tree = elem; 2559 for (i = 2; i <= start; ++i) 2560 { 2561 elem = duplicate_tree (elem, dfa); 2562 tree = create_tree (dfa, tree, elem, CONCAT); 2563 if (BE (elem == NULL || tree == NULL, 0)) 2564 goto parse_dup_op_espace; 2565 } 2566 2567 if (start == end) 2568 return tree; 2569 2570 /* Duplicate ELEM before it is marked optional. */ 2571 elem = duplicate_tree (elem, dfa); 2572 old_tree = tree; 2573 } 2574 else 2575 old_tree = NULL; 2576 2577 if (elem->token.type == SUBEXP) 2578 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2579 2580 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT)); 2581 if (BE (tree == NULL, 0)) 2582 goto parse_dup_op_espace; 2583 2584 /* This loop is actually executed only when end != -1, 2585 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2586 already created the start+1-th copy. */ 2587 for (i = start + 2; i <= end; ++i) 2588 { 2589 elem = duplicate_tree (elem, dfa); 2590 tree = create_tree (dfa, tree, elem, CONCAT); 2591 if (BE (elem == NULL || tree == NULL, 0)) 2592 goto parse_dup_op_espace; 2593 2594 tree = create_tree (dfa, tree, NULL, OP_ALT); 2595 if (BE (tree == NULL, 0)) 2596 goto parse_dup_op_espace; 2597 } 2598 2599 if (old_tree) 2600 tree = create_tree (dfa, old_tree, tree, CONCAT); 2601 2602 return tree; 2603 2604 parse_dup_op_espace: 2605 *err = REG_ESPACE; 2606 return NULL; 2607} 2608 2609/* Size of the names for collating symbol/equivalence_class/character_class. 2610 I'm not sure, but maybe enough. */ 2611#define BRACKET_NAME_BUF_SIZE 32 2612 2613#ifndef _LIBC 2614 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2615 Build the range expression which starts from START_ELEM, and ends 2616 at END_ELEM. The result are written to MBCSET and SBCSET. 2617 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2618 mbcset->range_ends, is a pointer argument sinse we may 2619 update it. */ 2620 2621static reg_errcode_t 2622internal_function 2623# ifdef RE_ENABLE_I18N 2624build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, 2625 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2626# else /* not RE_ENABLE_I18N */ 2627build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, 2628 bracket_elem_t *end_elem) 2629# endif /* not RE_ENABLE_I18N */ 2630{ 2631 unsigned int start_ch, end_ch; 2632 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2633 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2634 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2635 0)) 2636 return REG_ERANGE; 2637 2638 /* We can handle no multi character collating elements without libc 2639 support. */ 2640 if (BE ((start_elem->type == COLL_SYM 2641 && strlen ((char *) start_elem->opr.name) > 1) 2642 || (end_elem->type == COLL_SYM 2643 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2644 return REG_ECOLLATE; 2645 2646# ifdef RE_ENABLE_I18N 2647 { 2648 wchar_t wc; 2649 wint_t start_wc; 2650 wint_t end_wc; 2651 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2652 2653 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2654 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2655 : 0)); 2656 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2657 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2658 : 0)); 2659 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2660 ? __btowc (start_ch) : start_elem->opr.wch); 2661 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2662 ? __btowc (end_ch) : end_elem->opr.wch); 2663 if (start_wc == WEOF || end_wc == WEOF) 2664 return REG_ECOLLATE; 2665 cmp_buf[0] = start_wc; 2666 cmp_buf[4] = end_wc; 2667 if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2668 return REG_ERANGE; 2669 2670 /* Got valid collation sequence values, add them as a new entry. 2671 However, for !_LIBC we have no collation elements: if the 2672 character set is single byte, the single byte character set 2673 that we build below suffices. parse_bracket_exp passes 2674 no MBCSET if dfa->mb_cur_max == 1. */ 2675 if (mbcset) 2676 { 2677 /* Check the space of the arrays. */ 2678 if (BE (*range_alloc == mbcset->nranges, 0)) 2679 { 2680 /* There is not enough space, need realloc. */ 2681 wchar_t *new_array_start, *new_array_end; 2682 int new_nranges; 2683 2684 /* +1 in case of mbcset->nranges is 0. */ 2685 new_nranges = 2 * mbcset->nranges + 1; 2686 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2687 are NULL if *range_alloc == 0. */ 2688 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2689 new_nranges); 2690 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2691 new_nranges); 2692 2693 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2694 return REG_ESPACE; 2695 2696 mbcset->range_starts = new_array_start; 2697 mbcset->range_ends = new_array_end; 2698 *range_alloc = new_nranges; 2699 } 2700 2701 mbcset->range_starts[mbcset->nranges] = start_wc; 2702 mbcset->range_ends[mbcset->nranges++] = end_wc; 2703 } 2704 2705 /* Build the table for single byte characters. */ 2706 for (wc = 0; wc < SBC_MAX; ++wc) 2707 { 2708 cmp_buf[2] = wc; 2709 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2710 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2711 bitset_set (sbcset, wc); 2712 } 2713 } 2714# else /* not RE_ENABLE_I18N */ 2715 { 2716 unsigned int ch; 2717 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2718 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2719 : 0)); 2720 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2721 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2722 : 0)); 2723 if (start_ch > end_ch) 2724 return REG_ERANGE; 2725 /* Build the table for single byte characters. */ 2726 for (ch = 0; ch < SBC_MAX; ++ch) 2727 if (start_ch <= ch && ch <= end_ch) 2728 bitset_set (sbcset, ch); 2729 } 2730# endif /* not RE_ENABLE_I18N */ 2731 return REG_NOERROR; 2732} 2733#endif /* not _LIBC */ 2734 2735#ifndef _LIBC 2736/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2737 Build the collating element which is represented by NAME. 2738 The result are written to MBCSET and SBCSET. 2739 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2740 pointer argument since we may update it. */ 2741 2742static reg_errcode_t 2743internal_function 2744# ifdef RE_ENABLE_I18N 2745build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 2746 int *coll_sym_alloc, const unsigned char *name) 2747# else /* not RE_ENABLE_I18N */ 2748build_collating_symbol (bitset_t sbcset, const unsigned char *name) 2749# endif /* not RE_ENABLE_I18N */ 2750{ 2751 size_t name_len = strlen ((const char *) name); 2752 if (BE (name_len != 1, 0)) 2753 return REG_ECOLLATE; 2754 else 2755 { 2756 bitset_set (sbcset, name[0]); 2757 return REG_NOERROR; 2758 } 2759} 2760#endif /* not _LIBC */ 2761 2762/* This function parse bracket expression like "[abc]", "[a-c]", 2763 "[[.a-a.]]" etc. */ 2764 2765static bin_tree_t * 2766parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2767 reg_syntax_t syntax, reg_errcode_t *err) 2768{ 2769#ifdef _LIBC 2770 const unsigned char *collseqmb; 2771 const char *collseqwc; 2772 uint32_t nrules; 2773 int32_t table_size; 2774 const int32_t *symb_table; 2775 const unsigned char *extra; 2776 2777 /* Local function for parse_bracket_exp used in _LIBC environement. 2778 Seek the collating symbol entry correspondings to NAME. 2779 Return the index of the symbol in the SYMB_TABLE, 2780 or -1 if not found. */ 2781 2782 auto inline int32_t 2783 __attribute ((always_inline)) 2784 seek_collating_symbol_entry (const unsigned char *name, size_t name_len) 2785 { 2786 int32_t elem; 2787 2788 for (elem = 0; elem < table_size; elem++) 2789 if (symb_table[2 * elem] != 0) 2790 { 2791 int32_t idx = symb_table[2 * elem + 1]; 2792 /* Skip the name of collating element name. */ 2793 idx += 1 + extra[idx]; 2794 if (/* Compare the length of the name. */ 2795 name_len == extra[idx] 2796 /* Compare the name. */ 2797 && memcmp (name, &extra[idx + 1], name_len) == 0) 2798 /* Yep, this is the entry. */ 2799 return elem; 2800 } 2801 return -1; 2802 } 2803 2804 /* Local function for parse_bracket_exp used in _LIBC environment. 2805 Look up the collation sequence value of BR_ELEM. 2806 Return the value if succeeded, UINT_MAX otherwise. */ 2807 2808 auto inline unsigned int 2809 __attribute ((always_inline)) 2810 lookup_collation_sequence_value (bracket_elem_t *br_elem) 2811 { 2812 if (br_elem->type == SB_CHAR) 2813 { 2814 /* 2815 if (MB_CUR_MAX == 1) 2816 */ 2817 if (nrules == 0) 2818 return collseqmb[br_elem->opr.ch]; 2819 else 2820 { 2821 wint_t wc = __btowc (br_elem->opr.ch); 2822 return __collseq_table_lookup (collseqwc, wc); 2823 } 2824 } 2825 else if (br_elem->type == MB_CHAR) 2826 { 2827 if (nrules != 0) 2828 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2829 } 2830 else if (br_elem->type == COLL_SYM) 2831 { 2832 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2833 if (nrules != 0) 2834 { 2835 int32_t elem, idx; 2836 elem = seek_collating_symbol_entry (br_elem->opr.name, 2837 sym_name_len); 2838 if (elem != -1) 2839 { 2840 /* We found the entry. */ 2841 idx = symb_table[2 * elem + 1]; 2842 /* Skip the name of collating element name. */ 2843 idx += 1 + extra[idx]; 2844 /* Skip the byte sequence of the collating element. */ 2845 idx += 1 + extra[idx]; 2846 /* Adjust for the alignment. */ 2847 idx = (idx + 3) & ~3; 2848 /* Skip the multibyte collation sequence value. */ 2849 idx += sizeof (unsigned int); 2850 /* Skip the wide char sequence of the collating element. */ 2851 idx += sizeof (unsigned int) * 2852 (1 + *(unsigned int *) (extra + idx)); 2853 /* Return the collation sequence value. */ 2854 return *(unsigned int *) (extra + idx); 2855 } 2856 else if (sym_name_len == 1) 2857 { 2858 /* No valid character. Match it as a single byte 2859 character. */ 2860 return collseqmb[br_elem->opr.name[0]]; 2861 } 2862 } 2863 else if (sym_name_len == 1) 2864 return collseqmb[br_elem->opr.name[0]]; 2865 } 2866 return UINT_MAX; 2867 } 2868 2869 /* Local function for parse_bracket_exp used in _LIBC environement. 2870 Build the range expression which starts from START_ELEM, and ends 2871 at END_ELEM. The result are written to MBCSET and SBCSET. 2872 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2873 mbcset->range_ends, is a pointer argument sinse we may 2874 update it. */ 2875 2876 auto inline reg_errcode_t 2877 __attribute ((always_inline)) 2878 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, 2879 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2880 { 2881 unsigned int ch; 2882 uint32_t start_collseq; 2883 uint32_t end_collseq; 2884 2885 /* Equivalence Classes and Character Classes can't be a range 2886 start/end. */ 2887 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2888 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2889 0)) 2890 return REG_ERANGE; 2891 2892 start_collseq = lookup_collation_sequence_value (start_elem); 2893 end_collseq = lookup_collation_sequence_value (end_elem); 2894 /* Check start/end collation sequence values. */ 2895 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2896 return REG_ECOLLATE; 2897 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2898 return REG_ERANGE; 2899 2900 /* Got valid collation sequence values, add them as a new entry. 2901 However, if we have no collation elements, and the character set 2902 is single byte, the single byte character set that we 2903 build below suffices. */ 2904 if (nrules > 0 || dfa->mb_cur_max > 1) 2905 { 2906 /* Check the space of the arrays. */ 2907 if (BE (*range_alloc == mbcset->nranges, 0)) 2908 { 2909 /* There is not enough space, need realloc. */ 2910 uint32_t *new_array_start; 2911 uint32_t *new_array_end; 2912 int new_nranges; 2913 2914 /* +1 in case of mbcset->nranges is 0. */ 2915 new_nranges = 2 * mbcset->nranges + 1; 2916 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2917 new_nranges); 2918 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2919 new_nranges); 2920 2921 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2922 return REG_ESPACE; 2923 2924 mbcset->range_starts = new_array_start; 2925 mbcset->range_ends = new_array_end; 2926 *range_alloc = new_nranges; 2927 } 2928 2929 mbcset->range_starts[mbcset->nranges] = start_collseq; 2930 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2931 } 2932 2933 /* Build the table for single byte characters. */ 2934 for (ch = 0; ch < SBC_MAX; ch++) 2935 { 2936 uint32_t ch_collseq; 2937 /* 2938 if (MB_CUR_MAX == 1) 2939 */ 2940 if (nrules == 0) 2941 ch_collseq = collseqmb[ch]; 2942 else 2943 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2944 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2945 bitset_set (sbcset, ch); 2946 } 2947 return REG_NOERROR; 2948 } 2949 2950 /* Local function for parse_bracket_exp used in _LIBC environement. 2951 Build the collating element which is represented by NAME. 2952 The result are written to MBCSET and SBCSET. 2953 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2954 pointer argument sinse we may update it. */ 2955 2956 auto inline reg_errcode_t 2957 __attribute ((always_inline)) 2958 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 2959 int *coll_sym_alloc, const unsigned char *name) 2960 { 2961 int32_t elem, idx; 2962 size_t name_len = strlen ((const char *) name); 2963 if (nrules != 0) 2964 { 2965 elem = seek_collating_symbol_entry (name, name_len); 2966 if (elem != -1) 2967 { 2968 /* We found the entry. */ 2969 idx = symb_table[2 * elem + 1]; 2970 /* Skip the name of collating element name. */ 2971 idx += 1 + extra[idx]; 2972 } 2973 else if (name_len == 1) 2974 { 2975 /* No valid character, treat it as a normal 2976 character. */ 2977 bitset_set (sbcset, name[0]); 2978 return REG_NOERROR; 2979 } 2980 else 2981 return REG_ECOLLATE; 2982 2983 /* Got valid collation sequence, add it as a new entry. */ 2984 /* Check the space of the arrays. */ 2985 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 2986 { 2987 /* Not enough, realloc it. */ 2988 /* +1 in case of mbcset->ncoll_syms is 0. */ 2989 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 2990 /* Use realloc since mbcset->coll_syms is NULL 2991 if *alloc == 0. */ 2992 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 2993 new_coll_sym_alloc); 2994 if (BE (new_coll_syms == NULL, 0)) 2995 return REG_ESPACE; 2996 mbcset->coll_syms = new_coll_syms; 2997 *coll_sym_alloc = new_coll_sym_alloc; 2998 } 2999 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3000 return REG_NOERROR; 3001 } 3002 else 3003 { 3004 if (BE (name_len != 1, 0)) 3005 return REG_ECOLLATE; 3006 else 3007 { 3008 bitset_set (sbcset, name[0]); 3009 return REG_NOERROR; 3010 } 3011 } 3012 } 3013#endif 3014 3015 re_token_t br_token; 3016 re_bitset_ptr_t sbcset; 3017#ifdef RE_ENABLE_I18N 3018 re_charset_t *mbcset; 3019 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3020 int equiv_class_alloc = 0, char_class_alloc = 0; 3021#endif /* not RE_ENABLE_I18N */ 3022 int non_match = 0; 3023 bin_tree_t *work_tree; 3024 int token_len; 3025 int first_round = 1; 3026#ifdef _LIBC 3027 collseqmb = (const unsigned char *) 3028 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3029 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3030 if (nrules) 3031 { 3032 /* 3033 if (MB_CUR_MAX > 1) 3034 */ 3035 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3036 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3037 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3038 _NL_COLLATE_SYMB_TABLEMB); 3039 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3040 _NL_COLLATE_SYMB_EXTRAMB); 3041 } 3042#endif 3043 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3044#ifdef RE_ENABLE_I18N 3045 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3046#endif /* RE_ENABLE_I18N */ 3047#ifdef RE_ENABLE_I18N 3048 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3049#else 3050 if (BE (sbcset == NULL, 0)) 3051#endif /* RE_ENABLE_I18N */ 3052 { 3053 re_free (sbcset); 3054#ifdef RE_ENABLE_I18N 3055 re_free (mbcset); 3056#endif 3057 *err = REG_ESPACE; 3058 return NULL; 3059 } 3060 3061 token_len = peek_token_bracket (token, regexp, syntax); 3062 if (BE (token->type == END_OF_RE, 0)) 3063 { 3064 *err = REG_BADPAT; 3065 goto parse_bracket_exp_free_return; 3066 } 3067 if (token->type == OP_NON_MATCH_LIST) 3068 { 3069#ifdef RE_ENABLE_I18N 3070 mbcset->non_match = 1; 3071#endif /* not RE_ENABLE_I18N */ 3072 non_match = 1; 3073 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3074 bitset_set (sbcset, '\n'); 3075 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3076 token_len = peek_token_bracket (token, regexp, syntax); 3077 if (BE (token->type == END_OF_RE, 0)) 3078 { 3079 *err = REG_BADPAT; 3080 goto parse_bracket_exp_free_return; 3081 } 3082 } 3083 3084 /* We treat the first ']' as a normal character. */ 3085 if (token->type == OP_CLOSE_BRACKET) 3086 token->type = CHARACTER; 3087 3088 while (1) 3089 { 3090 bracket_elem_t start_elem, end_elem; 3091 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3092 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3093 reg_errcode_t ret; 3094 int token_len2 = 0, is_range_exp = 0; 3095 re_token_t token2; 3096 3097 start_elem.opr.name = start_name_buf; 3098 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3099 syntax, first_round); 3100 if (BE (ret != REG_NOERROR, 0)) 3101 { 3102 *err = ret; 3103 goto parse_bracket_exp_free_return; 3104 } 3105 first_round = 0; 3106 3107 /* Get information about the next token. We need it in any case. */ 3108 token_len = peek_token_bracket (token, regexp, syntax); 3109 3110 /* Do not check for ranges if we know they are not allowed. */ 3111 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3112 { 3113 if (BE (token->type == END_OF_RE, 0)) 3114 { 3115 *err = REG_EBRACK; 3116 goto parse_bracket_exp_free_return; 3117 } 3118 if (token->type == OP_CHARSET_RANGE) 3119 { 3120 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3121 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3122 if (BE (token2.type == END_OF_RE, 0)) 3123 { 3124 *err = REG_EBRACK; 3125 goto parse_bracket_exp_free_return; 3126 } 3127 if (token2.type == OP_CLOSE_BRACKET) 3128 { 3129 /* We treat the last '-' as a normal character. */ 3130 re_string_skip_bytes (regexp, -token_len); 3131 token->type = CHARACTER; 3132 } 3133 else 3134 is_range_exp = 1; 3135 } 3136 } 3137 3138 if (is_range_exp == 1) 3139 { 3140 end_elem.opr.name = end_name_buf; 3141 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3142 dfa, syntax, 1); 3143 if (BE (ret != REG_NOERROR, 0)) 3144 { 3145 *err = ret; 3146 goto parse_bracket_exp_free_return; 3147 } 3148 3149 token_len = peek_token_bracket (token, regexp, syntax); 3150 3151#ifdef _LIBC 3152 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3153 &start_elem, &end_elem); 3154#else 3155# ifdef RE_ENABLE_I18N 3156 *err = build_range_exp (sbcset, 3157 dfa->mb_cur_max > 1 ? mbcset : NULL, 3158 &range_alloc, &start_elem, &end_elem); 3159# else 3160 *err = build_range_exp (sbcset, &start_elem, &end_elem); 3161# endif 3162#endif /* RE_ENABLE_I18N */ 3163 if (BE (*err != REG_NOERROR, 0)) 3164 goto parse_bracket_exp_free_return; 3165 } 3166 else 3167 { 3168 switch (start_elem.type) 3169 { 3170 case SB_CHAR: 3171 bitset_set (sbcset, start_elem.opr.ch); 3172 break; 3173#ifdef RE_ENABLE_I18N 3174 case MB_CHAR: 3175 /* Check whether the array has enough space. */ 3176 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3177 { 3178 wchar_t *new_mbchars; 3179 /* Not enough, realloc it. */ 3180 /* +1 in case of mbcset->nmbchars is 0. */ 3181 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3182 /* Use realloc since array is NULL if *alloc == 0. */ 3183 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3184 mbchar_alloc); 3185 if (BE (new_mbchars == NULL, 0)) 3186 goto parse_bracket_exp_espace; 3187 mbcset->mbchars = new_mbchars; 3188 } 3189 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3190 break; 3191#endif /* RE_ENABLE_I18N */ 3192 case EQUIV_CLASS: 3193 *err = build_equiv_class (sbcset, 3194#ifdef RE_ENABLE_I18N 3195 mbcset, &equiv_class_alloc, 3196#endif /* RE_ENABLE_I18N */ 3197 start_elem.opr.name); 3198 if (BE (*err != REG_NOERROR, 0)) 3199 goto parse_bracket_exp_free_return; 3200 break; 3201 case COLL_SYM: 3202 *err = build_collating_symbol (sbcset, 3203#ifdef RE_ENABLE_I18N 3204 mbcset, &coll_sym_alloc, 3205#endif /* RE_ENABLE_I18N */ 3206 start_elem.opr.name); 3207 if (BE (*err != REG_NOERROR, 0)) 3208 goto parse_bracket_exp_free_return; 3209 break; 3210 case CHAR_CLASS: 3211 *err = build_charclass (regexp->trans, sbcset, 3212#ifdef RE_ENABLE_I18N 3213 mbcset, &char_class_alloc, 3214#endif /* RE_ENABLE_I18N */ 3215 start_elem.opr.name, syntax); 3216 if (BE (*err != REG_NOERROR, 0)) 3217 goto parse_bracket_exp_free_return; 3218 break; 3219 default: 3220 assert (0); 3221 break; 3222 } 3223 } 3224 if (BE (token->type == END_OF_RE, 0)) 3225 { 3226 *err = REG_EBRACK; 3227 goto parse_bracket_exp_free_return; 3228 } 3229 if (token->type == OP_CLOSE_BRACKET) 3230 break; 3231 } 3232 3233 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3234 3235 /* If it is non-matching list. */ 3236 if (non_match) 3237 bitset_not (sbcset); 3238 3239#ifdef RE_ENABLE_I18N 3240 /* Ensure only single byte characters are set. */ 3241 if (dfa->mb_cur_max > 1) 3242 bitset_mask (sbcset, dfa->sb_char); 3243 3244 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3245 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3246 || mbcset->non_match))) 3247 { 3248 bin_tree_t *mbc_tree; 3249 int sbc_idx; 3250 /* Build a tree for complex bracket. */ 3251 dfa->has_mb_node = 1; 3252 br_token.type = COMPLEX_BRACKET; 3253 br_token.opr.mbcset = mbcset; 3254 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3255 if (BE (mbc_tree == NULL, 0)) 3256 goto parse_bracket_exp_espace; 3257 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3258 if (sbcset[sbc_idx]) 3259 break; 3260 /* If there are no bits set in sbcset, there is no point 3261 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3262 if (sbc_idx < BITSET_WORDS) 3263 { 3264 /* Build a tree for simple bracket. */ 3265 br_token.type = SIMPLE_BRACKET; 3266 br_token.opr.sbcset = sbcset; 3267 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3268 if (BE (work_tree == NULL, 0)) 3269 goto parse_bracket_exp_espace; 3270 3271 /* Then join them by ALT node. */ 3272 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3273 if (BE (work_tree == NULL, 0)) 3274 goto parse_bracket_exp_espace; 3275 } 3276 else 3277 { 3278 re_free (sbcset); 3279 work_tree = mbc_tree; 3280 } 3281 } 3282 else 3283#endif /* not RE_ENABLE_I18N */ 3284 { 3285#ifdef RE_ENABLE_I18N 3286 free_charset (mbcset); 3287#endif 3288 /* Build a tree for simple bracket. */ 3289 br_token.type = SIMPLE_BRACKET; 3290 br_token.opr.sbcset = sbcset; 3291 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3292 if (BE (work_tree == NULL, 0)) 3293 goto parse_bracket_exp_espace; 3294 } 3295 return work_tree; 3296 3297 parse_bracket_exp_espace: 3298 *err = REG_ESPACE; 3299 parse_bracket_exp_free_return: 3300 re_free (sbcset); 3301#ifdef RE_ENABLE_I18N 3302 free_charset (mbcset); 3303#endif /* RE_ENABLE_I18N */ 3304 return NULL; 3305} 3306 3307/* Parse an element in the bracket expression. */ 3308 3309static reg_errcode_t 3310parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3311 re_token_t *token, int token_len, re_dfa_t *dfa, 3312 reg_syntax_t syntax, int accept_hyphen) 3313{ 3314#ifdef RE_ENABLE_I18N 3315 int cur_char_size; 3316 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3317 if (cur_char_size > 1) 3318 { 3319 elem->type = MB_CHAR; 3320 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3321 re_string_skip_bytes (regexp, cur_char_size); 3322 return REG_NOERROR; 3323 } 3324#endif /* RE_ENABLE_I18N */ 3325 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3326 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3327 || token->type == OP_OPEN_EQUIV_CLASS) 3328 return parse_bracket_symbol (elem, regexp, token); 3329 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3330 { 3331 /* A '-' must only appear as anything but a range indicator before 3332 the closing bracket. Everything else is an error. */ 3333 re_token_t token2; 3334 (void) peek_token_bracket (&token2, regexp, syntax); 3335 if (token2.type != OP_CLOSE_BRACKET) 3336 /* The actual error value is not standardized since this whole 3337 case is undefined. But ERANGE makes good sense. */ 3338 return REG_ERANGE; 3339 } 3340 elem->type = SB_CHAR; 3341 elem->opr.ch = token->opr.c; 3342 return REG_NOERROR; 3343} 3344 3345/* Parse a bracket symbol in the bracket expression. Bracket symbols are 3346 such as [:<character_class>:], [.<collating_element>.], and 3347 [=<equivalent_class>=]. */ 3348 3349static reg_errcode_t 3350parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3351 re_token_t *token) 3352{ 3353 unsigned char ch, delim = token->opr.c; 3354 int i = 0; 3355 if (re_string_eoi(regexp)) 3356 return REG_EBRACK; 3357 for (;; ++i) 3358 { 3359 if (i >= BRACKET_NAME_BUF_SIZE) 3360 return REG_EBRACK; 3361 if (token->type == OP_OPEN_CHAR_CLASS) 3362 ch = re_string_fetch_byte_case (regexp); 3363 else 3364 ch = re_string_fetch_byte (regexp); 3365 if (re_string_eoi(regexp)) 3366 return REG_EBRACK; 3367 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3368 break; 3369 elem->opr.name[i] = ch; 3370 } 3371 re_string_skip_bytes (regexp, 1); 3372 elem->opr.name[i] = '\0'; 3373 switch (token->type) 3374 { 3375 case OP_OPEN_COLL_ELEM: 3376 elem->type = COLL_SYM; 3377 break; 3378 case OP_OPEN_EQUIV_CLASS: 3379 elem->type = EQUIV_CLASS; 3380 break; 3381 case OP_OPEN_CHAR_CLASS: 3382 elem->type = CHAR_CLASS; 3383 break; 3384 default: 3385 break; 3386 } 3387 return REG_NOERROR; 3388} 3389 3390 /* Helper function for parse_bracket_exp. 3391 Build the equivalence class which is represented by NAME. 3392 The result are written to MBCSET and SBCSET. 3393 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3394 is a pointer argument sinse we may update it. */ 3395 3396static reg_errcode_t 3397#ifdef RE_ENABLE_I18N 3398build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, 3399 int *equiv_class_alloc, const unsigned char *name) 3400#else /* not RE_ENABLE_I18N */ 3401build_equiv_class (bitset_t sbcset, const unsigned char *name) 3402#endif /* not RE_ENABLE_I18N */ 3403{ 3404#ifdef _LIBC 3405 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3406 if (nrules != 0) 3407 { 3408 const int32_t *table, *indirect; 3409 const unsigned char *weights, *extra, *cp; 3410 unsigned char char_buf[2]; 3411 int32_t idx1, idx2; 3412 unsigned int ch; 3413 size_t len; 3414 /* This #include defines a local function! */ 3415# include <locale/weight.h> 3416 /* Calculate the index for equivalence class. */ 3417 cp = name; 3418 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3419 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3420 _NL_COLLATE_WEIGHTMB); 3421 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3422 _NL_COLLATE_EXTRAMB); 3423 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3424 _NL_COLLATE_INDIRECTMB); 3425 idx1 = findidx (&cp, -1); 3426 if (BE (idx1 == 0 || *cp != '\0', 0)) 3427 /* This isn't a valid character. */ 3428 return REG_ECOLLATE; 3429 3430 /* Build single byte matcing table for this equivalence class. */ 3431 len = weights[idx1 & 0xffffff]; 3432 for (ch = 0; ch < SBC_MAX; ++ch) 3433 { 3434 char_buf[0] = ch; 3435 cp = char_buf; 3436 idx2 = findidx (&cp, 1); 3437/* 3438 idx2 = table[ch]; 3439*/ 3440 if (idx2 == 0) 3441 /* This isn't a valid character. */ 3442 continue; 3443 /* Compare only if the length matches and the collation rule 3444 index is the same. */ 3445 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3446 { 3447 int cnt = 0; 3448 3449 while (cnt <= len && 3450 weights[(idx1 & 0xffffff) + 1 + cnt] 3451 == weights[(idx2 & 0xffffff) + 1 + cnt]) 3452 ++cnt; 3453 3454 if (cnt > len) 3455 bitset_set (sbcset, ch); 3456 } 3457 } 3458 /* Check whether the array has enough space. */ 3459 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3460 { 3461 /* Not enough, realloc it. */ 3462 /* +1 in case of mbcset->nequiv_classes is 0. */ 3463 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3464 /* Use realloc since the array is NULL if *alloc == 0. */ 3465 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3466 int32_t, 3467 new_equiv_class_alloc); 3468 if (BE (new_equiv_classes == NULL, 0)) 3469 return REG_ESPACE; 3470 mbcset->equiv_classes = new_equiv_classes; 3471 *equiv_class_alloc = new_equiv_class_alloc; 3472 } 3473 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3474 } 3475 else 3476#endif /* _LIBC */ 3477 { 3478 if (BE (strlen ((const char *) name) != 1, 0)) 3479 return REG_ECOLLATE; 3480 bitset_set (sbcset, *name); 3481 } 3482 return REG_NOERROR; 3483} 3484 3485 /* Helper function for parse_bracket_exp. 3486 Build the character class which is represented by NAME. 3487 The result are written to MBCSET and SBCSET. 3488 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3489 is a pointer argument sinse we may update it. */ 3490 3491static reg_errcode_t 3492#ifdef RE_ENABLE_I18N 3493build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3494 re_charset_t *mbcset, int *char_class_alloc, 3495 const unsigned char *class_name, reg_syntax_t syntax) 3496#else /* not RE_ENABLE_I18N */ 3497build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3498 const unsigned char *class_name, reg_syntax_t syntax) 3499#endif /* not RE_ENABLE_I18N */ 3500{ 3501 int i; 3502 const char *name = (const char *) class_name; 3503 3504 /* In case of REG_ICASE "upper" and "lower" match the both of 3505 upper and lower cases. */ 3506 if ((syntax & RE_ICASE) 3507 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3508 name = "alpha"; 3509 3510#ifdef RE_ENABLE_I18N 3511 /* Check the space of the arrays. */ 3512 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3513 { 3514 /* Not enough, realloc it. */ 3515 /* +1 in case of mbcset->nchar_classes is 0. */ 3516 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3517 /* Use realloc since array is NULL if *alloc == 0. */ 3518 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3519 new_char_class_alloc); 3520 if (BE (new_char_classes == NULL, 0)) 3521 return REG_ESPACE; 3522 mbcset->char_classes = new_char_classes; 3523 *char_class_alloc = new_char_class_alloc; 3524 } 3525 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3526#endif /* RE_ENABLE_I18N */ 3527 3528#define BUILD_CHARCLASS_LOOP(ctype_func) \ 3529 do { \ 3530 if (BE (trans != NULL, 0)) \ 3531 { \ 3532 for (i = 0; i < SBC_MAX; ++i) \ 3533 if (ctype_func (i)) \ 3534 bitset_set (sbcset, trans[i]); \ 3535 } \ 3536 else \ 3537 { \ 3538 for (i = 0; i < SBC_MAX; ++i) \ 3539 if (ctype_func (i)) \ 3540 bitset_set (sbcset, i); \ 3541 } \ 3542 } while (0) 3543 3544 if (strcmp (name, "alnum") == 0) 3545 BUILD_CHARCLASS_LOOP (isalnum); 3546 else if (strcmp (name, "cntrl") == 0) 3547 BUILD_CHARCLASS_LOOP (iscntrl); 3548 else if (strcmp (name, "lower") == 0) 3549 BUILD_CHARCLASS_LOOP (islower); 3550 else if (strcmp (name, "space") == 0) 3551 BUILD_CHARCLASS_LOOP (isspace); 3552 else if (strcmp (name, "alpha") == 0) 3553 BUILD_CHARCLASS_LOOP (isalpha); 3554 else if (strcmp (name, "digit") == 0) 3555 BUILD_CHARCLASS_LOOP (isdigit); 3556 else if (strcmp (name, "print") == 0) 3557 BUILD_CHARCLASS_LOOP (isprint); 3558 else if (strcmp (name, "upper") == 0) 3559 BUILD_CHARCLASS_LOOP (isupper); 3560 else if (strcmp (name, "blank") == 0) 3561 BUILD_CHARCLASS_LOOP (isblank); 3562 else if (strcmp (name, "graph") == 0) 3563 BUILD_CHARCLASS_LOOP (isgraph); 3564 else if (strcmp (name, "punct") == 0) 3565 BUILD_CHARCLASS_LOOP (ispunct); 3566 else if (strcmp (name, "xdigit") == 0) 3567 BUILD_CHARCLASS_LOOP (isxdigit); 3568 else 3569 return REG_ECTYPE; 3570 3571 return REG_NOERROR; 3572} 3573 3574static bin_tree_t * 3575build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3576 const unsigned char *class_name, 3577 const unsigned char *extra, int non_match, 3578 reg_errcode_t *err) 3579{ 3580 re_bitset_ptr_t sbcset; 3581#ifdef RE_ENABLE_I18N 3582 re_charset_t *mbcset; 3583 int alloc = 0; 3584#endif /* not RE_ENABLE_I18N */ 3585 reg_errcode_t ret; 3586 re_token_t br_token; 3587 bin_tree_t *tree; 3588 3589 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3590#ifdef RE_ENABLE_I18N 3591 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3592#endif /* RE_ENABLE_I18N */ 3593 3594#ifdef RE_ENABLE_I18N 3595 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3596#else /* not RE_ENABLE_I18N */ 3597 if (BE (sbcset == NULL, 0)) 3598#endif /* not RE_ENABLE_I18N */ 3599 { 3600 *err = REG_ESPACE; 3601 return NULL; 3602 } 3603 3604 if (non_match) 3605 { 3606#ifdef RE_ENABLE_I18N 3607 mbcset->non_match = 1; 3608#endif /* not RE_ENABLE_I18N */ 3609 } 3610 3611 /* We don't care the syntax in this case. */ 3612 ret = build_charclass (trans, sbcset, 3613#ifdef RE_ENABLE_I18N 3614 mbcset, &alloc, 3615#endif /* RE_ENABLE_I18N */ 3616 class_name, 0); 3617 3618 if (BE (ret != REG_NOERROR, 0)) 3619 { 3620 re_free (sbcset); 3621#ifdef RE_ENABLE_I18N 3622 free_charset (mbcset); 3623#endif /* RE_ENABLE_I18N */ 3624 *err = ret; 3625 return NULL; 3626 } 3627 /* \w match '_' also. */ 3628 for (; *extra; extra++) 3629 bitset_set (sbcset, *extra); 3630 3631 /* If it is non-matching list. */ 3632 if (non_match) 3633 bitset_not (sbcset); 3634 3635#ifdef RE_ENABLE_I18N 3636 /* Ensure only single byte characters are set. */ 3637 if (dfa->mb_cur_max > 1) 3638 bitset_mask (sbcset, dfa->sb_char); 3639#endif 3640 3641 /* Build a tree for simple bracket. */ 3642 br_token.type = SIMPLE_BRACKET; 3643 br_token.opr.sbcset = sbcset; 3644 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3645 if (BE (tree == NULL, 0)) 3646 goto build_word_op_espace; 3647 3648#ifdef RE_ENABLE_I18N 3649 if (dfa->mb_cur_max > 1) 3650 { 3651 bin_tree_t *mbc_tree; 3652 /* Build a tree for complex bracket. */ 3653 br_token.type = COMPLEX_BRACKET; 3654 br_token.opr.mbcset = mbcset; 3655 dfa->has_mb_node = 1; 3656 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3657 if (BE (mbc_tree == NULL, 0)) 3658 goto build_word_op_espace; 3659 /* Then join them by ALT node. */ 3660 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3661 if (BE (mbc_tree != NULL, 1)) 3662 return tree; 3663 } 3664 else 3665 { 3666 free_charset (mbcset); 3667 return tree; 3668 } 3669#else /* not RE_ENABLE_I18N */ 3670 return tree; 3671#endif /* not RE_ENABLE_I18N */ 3672 3673 build_word_op_espace: 3674 re_free (sbcset); 3675#ifdef RE_ENABLE_I18N 3676 free_charset (mbcset); 3677#endif /* RE_ENABLE_I18N */ 3678 *err = REG_ESPACE; 3679 return NULL; 3680} 3681 3682/* This is intended for the expressions like "a{1,3}". 3683 Fetch a number from `input', and return the number. 3684 Return -1, if the number field is empty like "{,1}". 3685 Return -2, If an error is occured. */ 3686 3687static int 3688fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3689{ 3690 int num = -1; 3691 unsigned char c; 3692 while (1) 3693 { 3694 fetch_token (token, input, syntax); 3695 c = token->opr.c; 3696 if (BE (token->type == END_OF_RE, 0)) 3697 return -2; 3698 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3699 break; 3700 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) 3701 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0')); 3702 num = (num > RE_DUP_MAX) ? -2 : num; 3703 } 3704 return num; 3705} 3706 3707#ifdef RE_ENABLE_I18N 3708static void 3709free_charset (re_charset_t *cset) 3710{ 3711 re_free (cset->mbchars); 3712# ifdef _LIBC 3713 re_free (cset->coll_syms); 3714 re_free (cset->equiv_classes); 3715 re_free (cset->range_starts); 3716 re_free (cset->range_ends); 3717# endif 3718 re_free (cset->char_classes); 3719 re_free (cset); 3720} 3721#endif /* RE_ENABLE_I18N */ 3722 3723/* Functions for binary tree operation. */ 3724 3725/* Create a tree node. */ 3726 3727static bin_tree_t * 3728create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3729 re_token_type_t type) 3730{ 3731 re_token_t t; 3732 t.type = type; 3733 return create_token_tree (dfa, left, right, &t); 3734} 3735 3736static bin_tree_t * 3737create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3738 const re_token_t *token) 3739{ 3740 bin_tree_t *tree; 3741 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3742 { 3743 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3744 3745 if (storage == NULL) 3746 return NULL; 3747 storage->next = dfa->str_tree_storage; 3748 dfa->str_tree_storage = storage; 3749 dfa->str_tree_storage_idx = 0; 3750 } 3751 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3752 3753 tree->parent = NULL; 3754 tree->left = left; 3755 tree->right = right; 3756 tree->token = *token; 3757 tree->token.duplicated = 0; 3758 tree->token.opt_subexp = 0; 3759 tree->first = NULL; 3760 tree->next = NULL; 3761 tree->node_idx = -1; 3762 3763 if (left != NULL) 3764 left->parent = tree; 3765 if (right != NULL) 3766 right->parent = tree; 3767 return tree; 3768} 3769 3770/* Mark the tree SRC as an optional subexpression. 3771 To be called from preorder or postorder. */ 3772 3773static reg_errcode_t 3774mark_opt_subexp (void *extra, bin_tree_t *node) 3775{ 3776 int idx = (int) (long) extra; 3777 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3778 node->token.opt_subexp = 1; 3779 3780 return REG_NOERROR; 3781} 3782 3783/* Free the allocated memory inside NODE. */ 3784 3785static void 3786free_token (re_token_t *node) 3787{ 3788#ifdef RE_ENABLE_I18N 3789 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3790 free_charset (node->opr.mbcset); 3791 else 3792#endif /* RE_ENABLE_I18N */ 3793 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3794 re_free (node->opr.sbcset); 3795} 3796 3797/* Worker function for tree walking. Free the allocated memory inside NODE 3798 and its children. */ 3799 3800static reg_errcode_t 3801free_tree (void *extra, bin_tree_t *node) 3802{ 3803 free_token (&node->token); 3804 return REG_NOERROR; 3805} 3806 3807 3808/* Duplicate the node SRC, and return new node. This is a preorder 3809 visit similar to the one implemented by the generic visitor, but 3810 we need more infrastructure to maintain two parallel trees --- so, 3811 it's easier to duplicate. */ 3812 3813static bin_tree_t * 3814duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3815{ 3816 const bin_tree_t *node; 3817 bin_tree_t *dup_root; 3818 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3819 3820 for (node = root; ; ) 3821 { 3822 /* Create a new tree and link it back to the current parent. */ 3823 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3824 if (*p_new == NULL) 3825 return NULL; 3826 (*p_new)->parent = dup_node; 3827 (*p_new)->token.duplicated = 1; 3828 dup_node = *p_new; 3829 3830 /* Go to the left node, or up and to the right. */ 3831 if (node->left) 3832 { 3833 node = node->left; 3834 p_new = &dup_node->left; 3835 } 3836 else 3837 { 3838 const bin_tree_t *prev = NULL; 3839 while (node->right == prev || node->right == NULL) 3840 { 3841 prev = node; 3842 node = node->parent; 3843 dup_node = dup_node->parent; 3844 if (!node) 3845 return dup_root; 3846 } 3847 node = node->right; 3848 p_new = &dup_node->right; 3849 } 3850 } 3851} 3852