1/* -*- buffer-read-only: t -*- vi: set ro: */ 2/* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3#line 1 4/* Extended regular expression matching and search library. 5 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free 6 Software Foundation, Inc. 7 This file is part of the GNU C Library. 8 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 9 10 This program is free software; you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 3, or (at your option) 13 any later version. 14 15 This program is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License along 21 with this program; if not, write to the Free Software Foundation, 22 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 23 24#include "intprops.h" 25#include "verify.h" 26static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 27 size_t length, reg_syntax_t syntax); 28static void re_compile_fastmap_iter (regex_t *bufp, 29 const re_dfastate_t *init_state, 30 char *fastmap); 31static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 32#ifdef RE_ENABLE_I18N 33static void free_charset (re_charset_t *cset); 34#endif /* RE_ENABLE_I18N */ 35static void free_workarea_compile (regex_t *preg); 36static reg_errcode_t create_initial_state (re_dfa_t *dfa); 37#ifdef RE_ENABLE_I18N 38static void optimize_utf8 (re_dfa_t *dfa); 39#endif 40static reg_errcode_t analyze (regex_t *preg); 41static reg_errcode_t preorder (bin_tree_t *root, 42 reg_errcode_t (fn (void *, bin_tree_t *)), 43 void *extra); 44static reg_errcode_t postorder (bin_tree_t *root, 45 reg_errcode_t (fn (void *, bin_tree_t *)), 46 void *extra); 47static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 48static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 49static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 50 bin_tree_t *node); 51static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 52static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 53static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 54static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 55static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 56 unsigned int constraint); 57static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 58static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 59 Idx node, bool root); 60static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 61static Idx fetch_number (re_string_t *input, re_token_t *token, 62 reg_syntax_t syntax); 63static int peek_token (re_token_t *token, re_string_t *input, 64 reg_syntax_t syntax) internal_function; 65static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 66 reg_syntax_t syntax, reg_errcode_t *err); 67static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 68 re_token_t *token, reg_syntax_t syntax, 69 Idx nest, reg_errcode_t *err); 70static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 71 re_token_t *token, reg_syntax_t syntax, 72 Idx nest, reg_errcode_t *err); 73static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 74 re_token_t *token, reg_syntax_t syntax, 75 Idx nest, reg_errcode_t *err); 76static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 77 re_token_t *token, reg_syntax_t syntax, 78 Idx nest, reg_errcode_t *err); 79static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 80 re_dfa_t *dfa, re_token_t *token, 81 reg_syntax_t syntax, reg_errcode_t *err); 82static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 83 re_token_t *token, reg_syntax_t syntax, 84 reg_errcode_t *err); 85static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 86 re_string_t *regexp, 87 re_token_t *token, int token_len, 88 re_dfa_t *dfa, 89 reg_syntax_t syntax, 90 bool accept_hyphen); 91static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 92 re_string_t *regexp, 93 re_token_t *token); 94#ifdef RE_ENABLE_I18N 95static reg_errcode_t build_equiv_class (bitset_t sbcset, 96 re_charset_t *mbcset, 97 Idx *equiv_class_alloc, 98 const unsigned char *name); 99static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 100 bitset_t sbcset, 101 re_charset_t *mbcset, 102 Idx *char_class_alloc, 103 const unsigned char *class_name, 104 reg_syntax_t syntax); 105#else /* not RE_ENABLE_I18N */ 106static reg_errcode_t build_equiv_class (bitset_t sbcset, 107 const unsigned char *name); 108static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 109 bitset_t sbcset, 110 const unsigned char *class_name, 111 reg_syntax_t syntax); 112#endif /* not RE_ENABLE_I18N */ 113static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 114 RE_TRANSLATE_TYPE trans, 115 const unsigned char *class_name, 116 const unsigned char *extra, 117 bool non_match, reg_errcode_t *err); 118static bin_tree_t *create_tree (re_dfa_t *dfa, 119 bin_tree_t *left, bin_tree_t *right, 120 re_token_type_t type); 121static bin_tree_t *create_token_tree (re_dfa_t *dfa, 122 bin_tree_t *left, bin_tree_t *right, 123 const re_token_t *token); 124static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 125static void free_token (re_token_t *node); 126static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 127static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 128 129/* This table gives an error message for each of the error codes listed 130 in regex.h. Obviously the order here has to be same as there. 131 POSIX doesn't require that we do anything for REG_NOERROR, 132 but why not be nice? */ 133 134static const char __re_error_msgid[] = 135 { 136#define REG_NOERROR_IDX 0 137 gettext_noop ("Success") /* REG_NOERROR */ 138 "\0" 139#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 140 gettext_noop ("No match") /* REG_NOMATCH */ 141 "\0" 142#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 143 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 144 "\0" 145#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 146 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 147 "\0" 148#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 149 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 150 "\0" 151#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 152 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 153 "\0" 154#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 155 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 156 "\0" 157#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 158 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 159 "\0" 160#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 161 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 162 "\0" 163#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 164 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 165 "\0" 166#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 167 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 168 "\0" 169#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 170 gettext_noop ("Invalid range end") /* REG_ERANGE */ 171 "\0" 172#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 173 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 174 "\0" 175#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 176 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 177 "\0" 178#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 179 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 180 "\0" 181#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 182 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 183 "\0" 184#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 185 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 186 }; 187 188static const size_t __re_error_msgid_idx[] = 189 { 190 REG_NOERROR_IDX, 191 REG_NOMATCH_IDX, 192 REG_BADPAT_IDX, 193 REG_ECOLLATE_IDX, 194 REG_ECTYPE_IDX, 195 REG_EESCAPE_IDX, 196 REG_ESUBREG_IDX, 197 REG_EBRACK_IDX, 198 REG_EPAREN_IDX, 199 REG_EBRACE_IDX, 200 REG_BADBR_IDX, 201 REG_ERANGE_IDX, 202 REG_ESPACE_IDX, 203 REG_BADRPT_IDX, 204 REG_EEND_IDX, 205 REG_ESIZE_IDX, 206 REG_ERPAREN_IDX 207 }; 208 209/* Entry points for GNU code. */ 210 211/* re_compile_pattern is the GNU regular expression compiler: it 212 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 213 Returns 0 if the pattern was valid, otherwise an error string. 214 215 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 216 are set in BUFP on entry. */ 217 218#ifdef _LIBC 219const char * 220re_compile_pattern (pattern, length, bufp) 221 const char *pattern; 222 size_t length; 223 struct re_pattern_buffer *bufp; 224#else /* size_t might promote */ 225const char * 226re_compile_pattern (const char *pattern, size_t length, 227 struct re_pattern_buffer *bufp) 228#endif 229{ 230 reg_errcode_t ret; 231 232 /* And GNU code determines whether or not to get register information 233 by passing null for the REGS argument to re_match, etc., not by 234 setting no_sub, unless RE_NO_SUB is set. */ 235 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 236 237 /* Match anchors at newline. */ 238 bufp->newline_anchor = 1; 239 240 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 241 242 if (!ret) 243 return NULL; 244 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 245} 246#ifdef _LIBC 247weak_alias (__re_compile_pattern, re_compile_pattern) 248#endif 249 250/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 251 also be assigned to arbitrarily: each pattern buffer stores its own 252 syntax, so it can be changed between regex compilations. */ 253/* This has no initializer because initialized variables in Emacs 254 become read-only after dumping. */ 255reg_syntax_t re_syntax_options; 256 257 258/* Specify the precise syntax of regexps for compilation. This provides 259 for compatibility for various utilities which historically have 260 different, incompatible syntaxes. 261 262 The argument SYNTAX is a bit mask comprised of the various bits 263 defined in regex.h. We return the old syntax. */ 264 265reg_syntax_t 266re_set_syntax (syntax) 267 reg_syntax_t syntax; 268{ 269 reg_syntax_t ret = re_syntax_options; 270 271 re_syntax_options = syntax; 272 return ret; 273} 274#ifdef _LIBC 275weak_alias (__re_set_syntax, re_set_syntax) 276#endif 277 278int 279re_compile_fastmap (bufp) 280 struct re_pattern_buffer *bufp; 281{ 282 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 283 char *fastmap = bufp->fastmap; 284 285 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 286 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 287 if (dfa->init_state != dfa->init_state_word) 288 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 289 if (dfa->init_state != dfa->init_state_nl) 290 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 291 if (dfa->init_state != dfa->init_state_begbuf) 292 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 293 bufp->fastmap_accurate = 1; 294 return 0; 295} 296#ifdef _LIBC 297weak_alias (__re_compile_fastmap, re_compile_fastmap) 298#endif 299 300static inline void 301__attribute ((always_inline)) 302re_set_fastmap (char *fastmap, bool icase, int ch) 303{ 304 fastmap[ch] = 1; 305 if (icase) 306 fastmap[tolower (ch)] = 1; 307} 308 309/* Helper function for re_compile_fastmap. 310 Compile fastmap for the initial_state INIT_STATE. */ 311 312static void 313re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 314 char *fastmap) 315{ 316 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 317 Idx node_cnt; 318 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 319 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 320 { 321 Idx node = init_state->nodes.elems[node_cnt]; 322 re_token_type_t type = dfa->nodes[node].type; 323 324 if (type == CHARACTER) 325 { 326 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 327#ifdef RE_ENABLE_I18N 328 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 329 { 330 unsigned char buf[MB_LEN_MAX]; 331 unsigned char *p; 332 wchar_t wc; 333 mbstate_t state; 334 335 p = buf; 336 *p++ = dfa->nodes[node].opr.c; 337 while (++node < dfa->nodes_len 338 && dfa->nodes[node].type == CHARACTER 339 && dfa->nodes[node].mb_partial) 340 *p++ = dfa->nodes[node].opr.c; 341 memset (&state, '\0', sizeof (state)); 342 if (__mbrtowc (&wc, (const char *) buf, p - buf, 343 &state) == p - buf 344 && (__wcrtomb ((char *) buf, towlower (wc), &state) 345 != (size_t) -1)) 346 re_set_fastmap (fastmap, false, buf[0]); 347 } 348#endif 349 } 350 else if (type == SIMPLE_BRACKET) 351 { 352 int i, ch; 353 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 354 { 355 int j; 356 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 357 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 358 if (w & ((bitset_word_t) 1 << j)) 359 re_set_fastmap (fastmap, icase, ch); 360 } 361 } 362#ifdef RE_ENABLE_I18N 363 else if (type == COMPLEX_BRACKET) 364 { 365 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 366 Idx i; 367 368# ifdef _LIBC 369 /* See if we have to try all bytes which start multiple collation 370 elements. 371 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 372 collation element, and don't catch 'b' since 'b' is 373 the only collation element which starts from 'b' (and 374 it is caught by SIMPLE_BRACKET). */ 375 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 376 && (cset->ncoll_syms || cset->nranges)) 377 { 378 const int32_t *table = (const int32_t *) 379 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 380 for (i = 0; i < SBC_MAX; ++i) 381 if (table[i] < 0) 382 re_set_fastmap (fastmap, icase, i); 383 } 384# endif /* _LIBC */ 385 386 /* See if we have to start the match at all multibyte characters, 387 i.e. where we would not find an invalid sequence. This only 388 applies to multibyte character sets; for single byte character 389 sets, the SIMPLE_BRACKET again suffices. */ 390 if (dfa->mb_cur_max > 1 391 && (cset->nchar_classes || cset->non_match || cset->nranges 392# ifdef _LIBC 393 || cset->nequiv_classes 394# endif /* _LIBC */ 395 )) 396 { 397 unsigned char c = 0; 398 do 399 { 400 mbstate_t mbs; 401 memset (&mbs, 0, sizeof (mbs)); 402 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 403 re_set_fastmap (fastmap, false, (int) c); 404 } 405 while (++c != 0); 406 } 407 408 else 409 { 410 /* ... Else catch all bytes which can start the mbchars. */ 411 for (i = 0; i < cset->nmbchars; ++i) 412 { 413 char buf[256]; 414 mbstate_t state; 415 memset (&state, '\0', sizeof (state)); 416 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 417 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 418 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 419 { 420 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 421 != (size_t) -1) 422 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 423 } 424 } 425 } 426 } 427#endif /* RE_ENABLE_I18N */ 428 else if (type == OP_PERIOD 429#ifdef RE_ENABLE_I18N 430 || type == OP_UTF8_PERIOD 431#endif /* RE_ENABLE_I18N */ 432 || type == END_OF_RE) 433 { 434 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 435 if (type == END_OF_RE) 436 bufp->can_be_null = 1; 437 return; 438 } 439 } 440} 441 442/* Entry point for POSIX code. */ 443/* regcomp takes a regular expression as a string and compiles it. 444 445 PREG is a regex_t *. We do not expect any fields to be initialized, 446 since POSIX says we shouldn't. Thus, we set 447 448 `buffer' to the compiled pattern; 449 `used' to the length of the compiled pattern; 450 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 451 REG_EXTENDED bit in CFLAGS is set; otherwise, to 452 RE_SYNTAX_POSIX_BASIC; 453 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 454 `fastmap' to an allocated space for the fastmap; 455 `fastmap_accurate' to zero; 456 `re_nsub' to the number of subexpressions in PATTERN. 457 458 PATTERN is the address of the pattern string. 459 460 CFLAGS is a series of bits which affect compilation. 461 462 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 463 use POSIX basic syntax. 464 465 If REG_NEWLINE is set, then . and [^...] don't match newline. 466 Also, regexec will try a match beginning after every newline. 467 468 If REG_ICASE is set, then we considers upper- and lowercase 469 versions of letters to be equivalent when matching. 470 471 If REG_NOSUB is set, then when PREG is passed to regexec, that 472 routine will report only success or failure, and nothing about the 473 registers. 474 475 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 476 the return codes and their meanings.) */ 477 478int 479regcomp (preg, pattern, cflags) 480 regex_t *_Restrict_ preg; 481 const char *_Restrict_ pattern; 482 int cflags; 483{ 484 reg_errcode_t ret; 485 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 486 : RE_SYNTAX_POSIX_BASIC); 487 488 preg->buffer = NULL; 489 preg->allocated = 0; 490 preg->used = 0; 491 492 /* Try to allocate space for the fastmap. */ 493 preg->fastmap = re_malloc (char, SBC_MAX); 494 if (BE (preg->fastmap == NULL, 0)) 495 return REG_ESPACE; 496 497 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 498 499 /* If REG_NEWLINE is set, newlines are treated differently. */ 500 if (cflags & REG_NEWLINE) 501 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 502 syntax &= ~RE_DOT_NEWLINE; 503 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 504 /* It also changes the matching behavior. */ 505 preg->newline_anchor = 1; 506 } 507 else 508 preg->newline_anchor = 0; 509 preg->no_sub = !!(cflags & REG_NOSUB); 510 preg->translate = NULL; 511 512 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 513 514 /* POSIX doesn't distinguish between an unmatched open-group and an 515 unmatched close-group: both are REG_EPAREN. */ 516 if (ret == REG_ERPAREN) 517 ret = REG_EPAREN; 518 519 /* We have already checked preg->fastmap != NULL. */ 520 if (BE (ret == REG_NOERROR, 1)) 521 /* Compute the fastmap now, since regexec cannot modify the pattern 522 buffer. This function never fails in this implementation. */ 523 (void) re_compile_fastmap (preg); 524 else 525 { 526 /* Some error occurred while compiling the expression. */ 527 re_free (preg->fastmap); 528 preg->fastmap = NULL; 529 } 530 531 return (int) ret; 532} 533#ifdef _LIBC 534weak_alias (__regcomp, regcomp) 535#endif 536 537/* Returns a message corresponding to an error code, ERRCODE, returned 538 from either regcomp or regexec. We don't use PREG here. */ 539 540#ifdef _LIBC 541size_t 542regerror (errcode, preg, errbuf, errbuf_size) 543 int errcode; 544 const regex_t *_Restrict_ preg; 545 char *_Restrict_ errbuf; 546 size_t errbuf_size; 547#else /* size_t might promote */ 548size_t 549regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_, 550 char *_Restrict_ errbuf, size_t errbuf_size) 551#endif 552{ 553 const char *msg; 554 size_t msg_size; 555 556 if (BE (errcode < 0 557 || errcode >= (int) (sizeof (__re_error_msgid_idx) 558 / sizeof (__re_error_msgid_idx[0])), 0)) 559 /* Only error codes returned by the rest of the code should be passed 560 to this routine. If we are given anything else, or if other regex 561 code generates an invalid error code, then the program has a bug. 562 Dump core so we can fix it. */ 563 abort (); 564 565 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 566 567 msg_size = strlen (msg) + 1; /* Includes the null. */ 568 569 if (BE (errbuf_size != 0, 1)) 570 { 571 size_t cpy_size = msg_size; 572 if (BE (msg_size > errbuf_size, 0)) 573 { 574 cpy_size = errbuf_size - 1; 575 errbuf[cpy_size] = '\0'; 576 } 577 memcpy (errbuf, msg, cpy_size); 578 } 579 580 return msg_size; 581} 582#ifdef _LIBC 583weak_alias (__regerror, regerror) 584#endif 585 586 587#ifdef RE_ENABLE_I18N 588/* This static array is used for the map to single-byte characters when 589 UTF-8 is used. Otherwise we would allocate memory just to initialize 590 it the same all the time. UTF-8 is the preferred encoding so this is 591 a worthwhile optimization. */ 592static const bitset_t utf8_sb_map = 593{ 594 /* Set the first 128 bits. */ 595# if 4 * BITSET_WORD_BITS < ASCII_CHARS 596# error "bitset_word_t is narrower than 32 bits" 597# elif 3 * BITSET_WORD_BITS < ASCII_CHARS 598 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 599# elif 2 * BITSET_WORD_BITS < ASCII_CHARS 600 BITSET_WORD_MAX, BITSET_WORD_MAX, 601# elif 1 * BITSET_WORD_BITS < ASCII_CHARS 602 BITSET_WORD_MAX, 603# endif 604 (BITSET_WORD_MAX 605 >> (SBC_MAX % BITSET_WORD_BITS == 0 606 ? 0 607 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 608}; 609#endif 610 611 612static void 613free_dfa_content (re_dfa_t *dfa) 614{ 615 Idx i, j; 616 617 if (dfa->nodes) 618 for (i = 0; i < dfa->nodes_len; ++i) 619 free_token (dfa->nodes + i); 620 re_free (dfa->nexts); 621 for (i = 0; i < dfa->nodes_len; ++i) 622 { 623 if (dfa->eclosures != NULL) 624 re_node_set_free (dfa->eclosures + i); 625 if (dfa->inveclosures != NULL) 626 re_node_set_free (dfa->inveclosures + i); 627 if (dfa->edests != NULL) 628 re_node_set_free (dfa->edests + i); 629 } 630 re_free (dfa->edests); 631 re_free (dfa->eclosures); 632 re_free (dfa->inveclosures); 633 re_free (dfa->nodes); 634 635 if (dfa->state_table) 636 for (i = 0; i <= dfa->state_hash_mask; ++i) 637 { 638 struct re_state_table_entry *entry = dfa->state_table + i; 639 for (j = 0; j < entry->num; ++j) 640 { 641 re_dfastate_t *state = entry->array[j]; 642 free_state (state); 643 } 644 re_free (entry->array); 645 } 646 re_free (dfa->state_table); 647#ifdef RE_ENABLE_I18N 648 if (dfa->sb_char != utf8_sb_map) 649 re_free (dfa->sb_char); 650#endif 651 re_free (dfa->subexp_map); 652#ifdef DEBUG 653 re_free (dfa->re_str); 654#endif 655 656 re_free (dfa); 657} 658 659 660/* Free dynamically allocated space used by PREG. */ 661 662void 663regfree (preg) 664 regex_t *preg; 665{ 666 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 667 if (BE (dfa != NULL, 1)) 668 free_dfa_content (dfa); 669 preg->buffer = NULL; 670 preg->allocated = 0; 671 672 re_free (preg->fastmap); 673 preg->fastmap = NULL; 674 675 re_free (preg->translate); 676 preg->translate = NULL; 677} 678#ifdef _LIBC 679weak_alias (__regfree, regfree) 680#endif 681 682/* Entry points compatible with 4.2 BSD regex library. We don't define 683 them unless specifically requested. */ 684 685#if defined _REGEX_RE_COMP || defined _LIBC 686 687/* BSD has one and only one pattern buffer. */ 688static struct re_pattern_buffer re_comp_buf; 689 690char * 691# ifdef _LIBC 692/* Make these definitions weak in libc, so POSIX programs can redefine 693 these names if they don't use our functions, and still use 694 regcomp/regexec above without link errors. */ 695weak_function 696# endif 697re_comp (s) 698 const char *s; 699{ 700 reg_errcode_t ret; 701 char *fastmap; 702 703 if (!s) 704 { 705 if (!re_comp_buf.buffer) 706 return gettext ("No previous regular expression"); 707 return 0; 708 } 709 710 if (re_comp_buf.buffer) 711 { 712 fastmap = re_comp_buf.fastmap; 713 re_comp_buf.fastmap = NULL; 714 __regfree (&re_comp_buf); 715 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 716 re_comp_buf.fastmap = fastmap; 717 } 718 719 if (re_comp_buf.fastmap == NULL) 720 { 721 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 722 if (re_comp_buf.fastmap == NULL) 723 return (char *) gettext (__re_error_msgid 724 + __re_error_msgid_idx[(int) REG_ESPACE]); 725 } 726 727 /* Since `re_exec' always passes NULL for the `regs' argument, we 728 don't need to initialize the pattern buffer fields which affect it. */ 729 730 /* Match anchors at newlines. */ 731 re_comp_buf.newline_anchor = 1; 732 733 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 734 735 if (!ret) 736 return NULL; 737 738 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 739 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 740} 741 742#ifdef _LIBC 743libc_freeres_fn (free_mem) 744{ 745 __regfree (&re_comp_buf); 746} 747#endif 748 749#endif /* _REGEX_RE_COMP */ 750 751/* Internal entry point. 752 Compile the regular expression PATTERN, whose length is LENGTH. 753 SYNTAX indicate regular expression's syntax. */ 754 755static reg_errcode_t 756re_compile_internal (regex_t *preg, const char * pattern, size_t length, 757 reg_syntax_t syntax) 758{ 759 reg_errcode_t err = REG_NOERROR; 760 re_dfa_t *dfa; 761 re_string_t regexp; 762 763 /* Initialize the pattern buffer. */ 764 preg->fastmap_accurate = 0; 765 preg->syntax = syntax; 766 preg->not_bol = preg->not_eol = 0; 767 preg->used = 0; 768 preg->re_nsub = 0; 769 preg->can_be_null = 0; 770 preg->regs_allocated = REGS_UNALLOCATED; 771 772 /* Initialize the dfa. */ 773 dfa = (re_dfa_t *) preg->buffer; 774 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 775 { 776 /* If zero allocated, but buffer is non-null, try to realloc 777 enough space. This loses if buffer's address is bogus, but 778 that is the user's responsibility. If ->buffer is NULL this 779 is a simple allocation. */ 780 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 781 if (dfa == NULL) 782 return REG_ESPACE; 783 preg->allocated = sizeof (re_dfa_t); 784 preg->buffer = (unsigned char *) dfa; 785 } 786 preg->used = sizeof (re_dfa_t); 787 788 err = init_dfa (dfa, length); 789 if (BE (err != REG_NOERROR, 0)) 790 { 791 free_dfa_content (dfa); 792 preg->buffer = NULL; 793 preg->allocated = 0; 794 return err; 795 } 796#ifdef DEBUG 797 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 798 dfa->re_str = re_malloc (char, length + 1); 799 strncpy (dfa->re_str, pattern, length + 1); 800#endif 801 802 __libc_lock_init (dfa->lock); 803 804 err = re_string_construct (®exp, pattern, length, preg->translate, 805 (syntax & RE_ICASE) != 0, dfa); 806 if (BE (err != REG_NOERROR, 0)) 807 { 808 re_compile_internal_free_return: 809 free_workarea_compile (preg); 810 re_string_destruct (®exp); 811 free_dfa_content (dfa); 812 preg->buffer = NULL; 813 preg->allocated = 0; 814 return err; 815 } 816 817 /* Parse the regular expression, and build a structure tree. */ 818 preg->re_nsub = 0; 819 dfa->str_tree = parse (®exp, preg, syntax, &err); 820 if (BE (dfa->str_tree == NULL, 0)) 821 goto re_compile_internal_free_return; 822 823 /* Analyze the tree and create the nfa. */ 824 err = analyze (preg); 825 if (BE (err != REG_NOERROR, 0)) 826 goto re_compile_internal_free_return; 827 828#ifdef RE_ENABLE_I18N 829 /* If possible, do searching in single byte encoding to speed things up. */ 830 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 831 optimize_utf8 (dfa); 832#endif 833 834 /* Then create the initial state of the dfa. */ 835 err = create_initial_state (dfa); 836 837 /* Release work areas. */ 838 free_workarea_compile (preg); 839 re_string_destruct (®exp); 840 841 if (BE (err != REG_NOERROR, 0)) 842 { 843 free_dfa_content (dfa); 844 preg->buffer = NULL; 845 preg->allocated = 0; 846 } 847 848 return err; 849} 850 851/* Initialize DFA. We use the length of the regular expression PAT_LEN 852 as the initial length of some arrays. */ 853 854static reg_errcode_t 855init_dfa (re_dfa_t *dfa, size_t pat_len) 856{ 857 __re_size_t table_size; 858#ifndef _LIBC 859 char *codeset_name; 860#endif 861#ifdef RE_ENABLE_I18N 862 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); 863#else 864 size_t max_i18n_object_size = 0; 865#endif 866 size_t max_object_size = 867 MAX (sizeof (struct re_state_table_entry), 868 MAX (sizeof (re_token_t), 869 MAX (sizeof (re_node_set), 870 MAX (sizeof (regmatch_t), 871 max_i18n_object_size)))); 872 873 memset (dfa, '\0', sizeof (re_dfa_t)); 874 875 /* Force allocation of str_tree_storage the first time. */ 876 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 877 878 /* Avoid overflows. The extra "/ 2" is for the table_size doubling 879 calculation below, and for similar doubling calculations 880 elsewhere. And it's <= rather than <, because some of the 881 doubling calculations add 1 afterwards. */ 882 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) 883 return REG_ESPACE; 884 885 dfa->nodes_alloc = pat_len + 1; 886 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 887 888 /* table_size = 2 ^ ceil(log pat_len) */ 889 for (table_size = 1; ; table_size <<= 1) 890 if (table_size > pat_len) 891 break; 892 893 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 894 dfa->state_hash_mask = table_size - 1; 895 896 dfa->mb_cur_max = MB_CUR_MAX; 897#ifdef _LIBC 898 if (dfa->mb_cur_max == 6 899 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 900 dfa->is_utf8 = 1; 901 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 902 != 0); 903#else 904 codeset_name = nl_langinfo (CODESET); 905 if (strcasecmp (codeset_name, "UTF-8") == 0 906 || strcasecmp (codeset_name, "UTF8") == 0) 907 dfa->is_utf8 = 1; 908 909 /* We check exhaustively in the loop below if this charset is a 910 superset of ASCII. */ 911 dfa->map_notascii = 0; 912#endif 913 914#ifdef RE_ENABLE_I18N 915 if (dfa->mb_cur_max > 1) 916 { 917 if (dfa->is_utf8) 918 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 919 else 920 { 921 int i, j, ch; 922 923 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 924 if (BE (dfa->sb_char == NULL, 0)) 925 return REG_ESPACE; 926 927 /* Set the bits corresponding to single byte chars. */ 928 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 929 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 930 { 931 wint_t wch = __btowc (ch); 932 if (wch != WEOF) 933 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 934# ifndef _LIBC 935 if (isascii (ch) && wch != ch) 936 dfa->map_notascii = 1; 937# endif 938 } 939 } 940 } 941#endif 942 943 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 944 return REG_ESPACE; 945 return REG_NOERROR; 946} 947 948/* Initialize WORD_CHAR table, which indicate which character is 949 "word". In this case "word" means that it is the word construction 950 character used by some operators like "\<", "\>", etc. */ 951 952static void 953internal_function 954init_word_char (re_dfa_t *dfa) 955{ 956 int i, j, ch; 957 dfa->word_ops_used = 1; 958 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 959 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 960 if (isalnum (ch) || ch == '_') 961 dfa->word_char[i] |= (bitset_word_t) 1 << j; 962} 963 964/* Free the work area which are only used while compiling. */ 965 966static void 967free_workarea_compile (regex_t *preg) 968{ 969 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 970 bin_tree_storage_t *storage, *next; 971 for (storage = dfa->str_tree_storage; storage; storage = next) 972 { 973 next = storage->next; 974 re_free (storage); 975 } 976 dfa->str_tree_storage = NULL; 977 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 978 dfa->str_tree = NULL; 979 re_free (dfa->org_indices); 980 dfa->org_indices = NULL; 981} 982 983/* Create initial states for all contexts. */ 984 985static reg_errcode_t 986create_initial_state (re_dfa_t *dfa) 987{ 988 Idx first, i; 989 reg_errcode_t err; 990 re_node_set init_nodes; 991 992 /* Initial states have the epsilon closure of the node which is 993 the first node of the regular expression. */ 994 first = dfa->str_tree->first->node_idx; 995 dfa->init_node = first; 996 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 997 if (BE (err != REG_NOERROR, 0)) 998 return err; 999 1000 /* The back-references which are in initial states can epsilon transit, 1001 since in this case all of the subexpressions can be null. 1002 Then we add epsilon closures of the nodes which are the next nodes of 1003 the back-references. */ 1004 if (dfa->nbackref > 0) 1005 for (i = 0; i < init_nodes.nelem; ++i) 1006 { 1007 Idx node_idx = init_nodes.elems[i]; 1008 re_token_type_t type = dfa->nodes[node_idx].type; 1009 1010 Idx clexp_idx; 1011 if (type != OP_BACK_REF) 1012 continue; 1013 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1014 { 1015 re_token_t *clexp_node; 1016 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1017 if (clexp_node->type == OP_CLOSE_SUBEXP 1018 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1019 break; 1020 } 1021 if (clexp_idx == init_nodes.nelem) 1022 continue; 1023 1024 if (type == OP_BACK_REF) 1025 { 1026 Idx dest_idx = dfa->edests[node_idx].elems[0]; 1027 if (!re_node_set_contains (&init_nodes, dest_idx)) 1028 { 1029 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1030 i = 0; 1031 } 1032 } 1033 } 1034 1035 /* It must be the first time to invoke acquire_state. */ 1036 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1037 /* We don't check ERR here, since the initial state must not be NULL. */ 1038 if (BE (dfa->init_state == NULL, 0)) 1039 return err; 1040 if (dfa->init_state->has_constraint) 1041 { 1042 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1043 CONTEXT_WORD); 1044 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1045 CONTEXT_NEWLINE); 1046 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1047 &init_nodes, 1048 CONTEXT_NEWLINE 1049 | CONTEXT_BEGBUF); 1050 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1051 || dfa->init_state_begbuf == NULL, 0)) 1052 return err; 1053 } 1054 else 1055 dfa->init_state_word = dfa->init_state_nl 1056 = dfa->init_state_begbuf = dfa->init_state; 1057 1058 re_node_set_free (&init_nodes); 1059 return REG_NOERROR; 1060} 1061 1062#ifdef RE_ENABLE_I18N 1063/* If it is possible to do searching in single byte encoding instead of UTF-8 1064 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1065 DFA nodes where needed. */ 1066 1067static void 1068optimize_utf8 (re_dfa_t *dfa) 1069{ 1070 Idx node; 1071 int i; 1072 bool mb_chars = false; 1073 bool has_period = false; 1074 1075 for (node = 0; node < dfa->nodes_len; ++node) 1076 switch (dfa->nodes[node].type) 1077 { 1078 case CHARACTER: 1079 if (dfa->nodes[node].opr.c >= ASCII_CHARS) 1080 mb_chars = true; 1081 break; 1082 case ANCHOR: 1083 switch (dfa->nodes[node].opr.ctx_type) 1084 { 1085 case LINE_FIRST: 1086 case LINE_LAST: 1087 case BUF_FIRST: 1088 case BUF_LAST: 1089 break; 1090 default: 1091 /* Word anchors etc. cannot be handled. It's okay to test 1092 opr.ctx_type since constraints (for all DFA nodes) are 1093 created by ORing one or more opr.ctx_type values. */ 1094 return; 1095 } 1096 break; 1097 case OP_PERIOD: 1098 has_period = true; 1099 break; 1100 case OP_BACK_REF: 1101 case OP_ALT: 1102 case END_OF_RE: 1103 case OP_DUP_ASTERISK: 1104 case OP_OPEN_SUBEXP: 1105 case OP_CLOSE_SUBEXP: 1106 break; 1107 case COMPLEX_BRACKET: 1108 return; 1109 case SIMPLE_BRACKET: 1110 /* Just double check. */ 1111 { 1112 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 1113 ? 0 1114 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); 1115 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1116 { 1117 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1118 return; 1119 rshift = 0; 1120 } 1121 } 1122 break; 1123 default: 1124 abort (); 1125 } 1126 1127 if (mb_chars || has_period) 1128 for (node = 0; node < dfa->nodes_len; ++node) 1129 { 1130 if (dfa->nodes[node].type == CHARACTER 1131 && dfa->nodes[node].opr.c >= ASCII_CHARS) 1132 dfa->nodes[node].mb_partial = 0; 1133 else if (dfa->nodes[node].type == OP_PERIOD) 1134 dfa->nodes[node].type = OP_UTF8_PERIOD; 1135 } 1136 1137 /* The search can be in single byte locale. */ 1138 dfa->mb_cur_max = 1; 1139 dfa->is_utf8 = 0; 1140 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1141} 1142#endif 1143 1144/* Analyze the structure tree, and calculate "first", "next", "edest", 1145 "eclosure", and "inveclosure". */ 1146 1147static reg_errcode_t 1148analyze (regex_t *preg) 1149{ 1150 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1151 reg_errcode_t ret; 1152 1153 /* Allocate arrays. */ 1154 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1155 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1156 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1157 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1158 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1159 || dfa->eclosures == NULL, 0)) 1160 return REG_ESPACE; 1161 1162 dfa->subexp_map = re_malloc (Idx, preg->re_nsub); 1163 if (dfa->subexp_map != NULL) 1164 { 1165 Idx i; 1166 for (i = 0; i < preg->re_nsub; i++) 1167 dfa->subexp_map[i] = i; 1168 preorder (dfa->str_tree, optimize_subexps, dfa); 1169 for (i = 0; i < preg->re_nsub; i++) 1170 if (dfa->subexp_map[i] != i) 1171 break; 1172 if (i == preg->re_nsub) 1173 { 1174 free (dfa->subexp_map); 1175 dfa->subexp_map = NULL; 1176 } 1177 } 1178 1179 ret = postorder (dfa->str_tree, lower_subexps, preg); 1180 if (BE (ret != REG_NOERROR, 0)) 1181 return ret; 1182 ret = postorder (dfa->str_tree, calc_first, dfa); 1183 if (BE (ret != REG_NOERROR, 0)) 1184 return ret; 1185 preorder (dfa->str_tree, calc_next, dfa); 1186 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1187 if (BE (ret != REG_NOERROR, 0)) 1188 return ret; 1189 ret = calc_eclosure (dfa); 1190 if (BE (ret != REG_NOERROR, 0)) 1191 return ret; 1192 1193 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1194 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1195 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1196 || dfa->nbackref) 1197 { 1198 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1199 if (BE (dfa->inveclosures == NULL, 0)) 1200 return REG_ESPACE; 1201 ret = calc_inveclosure (dfa); 1202 } 1203 1204 return ret; 1205} 1206 1207/* Our parse trees are very unbalanced, so we cannot use a stack to 1208 implement parse tree visits. Instead, we use parent pointers and 1209 some hairy code in these two functions. */ 1210static reg_errcode_t 1211postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1212 void *extra) 1213{ 1214 bin_tree_t *node, *prev; 1215 1216 for (node = root; ; ) 1217 { 1218 /* Descend down the tree, preferably to the left (or to the right 1219 if that's the only child). */ 1220 while (node->left || node->right) 1221 if (node->left) 1222 node = node->left; 1223 else 1224 node = node->right; 1225 1226 do 1227 { 1228 reg_errcode_t err = fn (extra, node); 1229 if (BE (err != REG_NOERROR, 0)) 1230 return err; 1231 if (node->parent == NULL) 1232 return REG_NOERROR; 1233 prev = node; 1234 node = node->parent; 1235 } 1236 /* Go up while we have a node that is reached from the right. */ 1237 while (node->right == prev || node->right == NULL); 1238 node = node->right; 1239 } 1240} 1241 1242static reg_errcode_t 1243preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1244 void *extra) 1245{ 1246 bin_tree_t *node; 1247 1248 for (node = root; ; ) 1249 { 1250 reg_errcode_t err = fn (extra, node); 1251 if (BE (err != REG_NOERROR, 0)) 1252 return err; 1253 1254 /* Go to the left node, or up and to the right. */ 1255 if (node->left) 1256 node = node->left; 1257 else 1258 { 1259 bin_tree_t *prev = NULL; 1260 while (node->right == prev || node->right == NULL) 1261 { 1262 prev = node; 1263 node = node->parent; 1264 if (!node) 1265 return REG_NOERROR; 1266 } 1267 node = node->right; 1268 } 1269 } 1270} 1271 1272/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1273 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1274 backreferences as well. Requires a preorder visit. */ 1275static reg_errcode_t 1276optimize_subexps (void *extra, bin_tree_t *node) 1277{ 1278 re_dfa_t *dfa = (re_dfa_t *) extra; 1279 1280 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1281 { 1282 int idx = node->token.opr.idx; 1283 node->token.opr.idx = dfa->subexp_map[idx]; 1284 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1285 } 1286 1287 else if (node->token.type == SUBEXP 1288 && node->left && node->left->token.type == SUBEXP) 1289 { 1290 Idx other_idx = node->left->token.opr.idx; 1291 1292 node->left = node->left->left; 1293 if (node->left) 1294 node->left->parent = node; 1295 1296 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1297 if (other_idx < BITSET_WORD_BITS) 1298 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1299 } 1300 1301 return REG_NOERROR; 1302} 1303 1304/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1305 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1306static reg_errcode_t 1307lower_subexps (void *extra, bin_tree_t *node) 1308{ 1309 regex_t *preg = (regex_t *) extra; 1310 reg_errcode_t err = REG_NOERROR; 1311 1312 if (node->left && node->left->token.type == SUBEXP) 1313 { 1314 node->left = lower_subexp (&err, preg, node->left); 1315 if (node->left) 1316 node->left->parent = node; 1317 } 1318 if (node->right && node->right->token.type == SUBEXP) 1319 { 1320 node->right = lower_subexp (&err, preg, node->right); 1321 if (node->right) 1322 node->right->parent = node; 1323 } 1324 1325 return err; 1326} 1327 1328static bin_tree_t * 1329lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1330{ 1331 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1332 bin_tree_t *body = node->left; 1333 bin_tree_t *op, *cls, *tree1, *tree; 1334 1335 if (preg->no_sub 1336 /* We do not optimize empty subexpressions, because otherwise we may 1337 have bad CONCAT nodes with NULL children. This is obviously not 1338 very common, so we do not lose much. An example that triggers 1339 this case is the sed "script" /\(\)/x. */ 1340 && node->left != NULL 1341 && (node->token.opr.idx >= BITSET_WORD_BITS 1342 || !(dfa->used_bkref_map 1343 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1344 return node->left; 1345 1346 /* Convert the SUBEXP node to the concatenation of an 1347 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1348 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1349 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1350 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1351 tree = create_tree (dfa, op, tree1, CONCAT); 1352 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1353 { 1354 *err = REG_ESPACE; 1355 return NULL; 1356 } 1357 1358 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1359 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1360 return tree; 1361} 1362 1363/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1364 nodes. Requires a postorder visit. */ 1365static reg_errcode_t 1366calc_first (void *extra, bin_tree_t *node) 1367{ 1368 re_dfa_t *dfa = (re_dfa_t *) extra; 1369 if (node->token.type == CONCAT) 1370 { 1371 node->first = node->left->first; 1372 node->node_idx = node->left->node_idx; 1373 } 1374 else 1375 { 1376 node->first = node; 1377 node->node_idx = re_dfa_add_node (dfa, node->token); 1378 if (BE (node->node_idx == REG_MISSING, 0)) 1379 return REG_ESPACE; 1380 if (node->token.type == ANCHOR) 1381 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1382 } 1383 return REG_NOERROR; 1384} 1385 1386/* Pass 2: compute NEXT on the tree. Preorder visit. */ 1387static reg_errcode_t 1388calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node) 1389{ 1390 switch (node->token.type) 1391 { 1392 case OP_DUP_ASTERISK: 1393 node->left->next = node; 1394 break; 1395 case CONCAT: 1396 node->left->next = node->right->first; 1397 node->right->next = node->next; 1398 break; 1399 default: 1400 if (node->left) 1401 node->left->next = node->next; 1402 if (node->right) 1403 node->right->next = node->next; 1404 break; 1405 } 1406 return REG_NOERROR; 1407} 1408 1409/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1410static reg_errcode_t 1411link_nfa_nodes (void *extra, bin_tree_t *node) 1412{ 1413 re_dfa_t *dfa = (re_dfa_t *) extra; 1414 Idx idx = node->node_idx; 1415 reg_errcode_t err = REG_NOERROR; 1416 1417 switch (node->token.type) 1418 { 1419 case CONCAT: 1420 break; 1421 1422 case END_OF_RE: 1423 assert (node->next == NULL); 1424 break; 1425 1426 case OP_DUP_ASTERISK: 1427 case OP_ALT: 1428 { 1429 Idx left, right; 1430 dfa->has_plural_match = 1; 1431 if (node->left != NULL) 1432 left = node->left->first->node_idx; 1433 else 1434 left = node->next->node_idx; 1435 if (node->right != NULL) 1436 right = node->right->first->node_idx; 1437 else 1438 right = node->next->node_idx; 1439 assert (REG_VALID_INDEX (left)); 1440 assert (REG_VALID_INDEX (right)); 1441 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1442 } 1443 break; 1444 1445 case ANCHOR: 1446 case OP_OPEN_SUBEXP: 1447 case OP_CLOSE_SUBEXP: 1448 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1449 break; 1450 1451 case OP_BACK_REF: 1452 dfa->nexts[idx] = node->next->node_idx; 1453 if (node->token.type == OP_BACK_REF) 1454 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1455 break; 1456 1457 default: 1458 assert (!IS_EPSILON_NODE (node->token.type)); 1459 dfa->nexts[idx] = node->next->node_idx; 1460 break; 1461 } 1462 1463 return err; 1464} 1465 1466/* Duplicate the epsilon closure of the node ROOT_NODE. 1467 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1468 to their own constraint. */ 1469 1470static reg_errcode_t 1471internal_function 1472duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, 1473 Idx root_node, unsigned int init_constraint) 1474{ 1475 Idx org_node, clone_node; 1476 bool ok; 1477 unsigned int constraint = init_constraint; 1478 for (org_node = top_org_node, clone_node = top_clone_node;;) 1479 { 1480 Idx org_dest, clone_dest; 1481 if (dfa->nodes[org_node].type == OP_BACK_REF) 1482 { 1483 /* If the back reference epsilon-transit, its destination must 1484 also have the constraint. Then duplicate the epsilon closure 1485 of the destination of the back reference, and store it in 1486 edests of the back reference. */ 1487 org_dest = dfa->nexts[org_node]; 1488 re_node_set_empty (dfa->edests + clone_node); 1489 clone_dest = duplicate_node (dfa, org_dest, constraint); 1490 if (BE (clone_dest == REG_MISSING, 0)) 1491 return REG_ESPACE; 1492 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1493 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1494 if (BE (! ok, 0)) 1495 return REG_ESPACE; 1496 } 1497 else if (dfa->edests[org_node].nelem == 0) 1498 { 1499 /* In case of the node can't epsilon-transit, don't duplicate the 1500 destination and store the original destination as the 1501 destination of the node. */ 1502 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1503 break; 1504 } 1505 else if (dfa->edests[org_node].nelem == 1) 1506 { 1507 /* In case of the node can epsilon-transit, and it has only one 1508 destination. */ 1509 org_dest = dfa->edests[org_node].elems[0]; 1510 re_node_set_empty (dfa->edests + clone_node); 1511 /* If the node is root_node itself, it means the epsilon closure 1512 has a loop. Then tie it to the destination of the root_node. */ 1513 if (org_node == root_node && clone_node != org_node) 1514 { 1515 ok = re_node_set_insert (dfa->edests + clone_node, org_dest); 1516 if (BE (! ok, 0)) 1517 return REG_ESPACE; 1518 break; 1519 } 1520 /* In case the node has another constraint, append it. */ 1521 constraint |= dfa->nodes[org_node].constraint; 1522 clone_dest = duplicate_node (dfa, org_dest, constraint); 1523 if (BE (clone_dest == REG_MISSING, 0)) 1524 return REG_ESPACE; 1525 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1526 if (BE (! ok, 0)) 1527 return REG_ESPACE; 1528 } 1529 else /* dfa->edests[org_node].nelem == 2 */ 1530 { 1531 /* In case of the node can epsilon-transit, and it has two 1532 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1533 org_dest = dfa->edests[org_node].elems[0]; 1534 re_node_set_empty (dfa->edests + clone_node); 1535 /* Search for a duplicated node which satisfies the constraint. */ 1536 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1537 if (clone_dest == REG_MISSING) 1538 { 1539 /* There is no such duplicated node, create a new one. */ 1540 reg_errcode_t err; 1541 clone_dest = duplicate_node (dfa, org_dest, constraint); 1542 if (BE (clone_dest == REG_MISSING, 0)) 1543 return REG_ESPACE; 1544 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1545 if (BE (! ok, 0)) 1546 return REG_ESPACE; 1547 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1548 root_node, constraint); 1549 if (BE (err != REG_NOERROR, 0)) 1550 return err; 1551 } 1552 else 1553 { 1554 /* There is a duplicated node which satisfies the constraint, 1555 use it to avoid infinite loop. */ 1556 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1557 if (BE (! ok, 0)) 1558 return REG_ESPACE; 1559 } 1560 1561 org_dest = dfa->edests[org_node].elems[1]; 1562 clone_dest = duplicate_node (dfa, org_dest, constraint); 1563 if (BE (clone_dest == REG_MISSING, 0)) 1564 return REG_ESPACE; 1565 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1566 if (BE (! ok, 0)) 1567 return REG_ESPACE; 1568 } 1569 org_node = org_dest; 1570 clone_node = clone_dest; 1571 } 1572 return REG_NOERROR; 1573} 1574 1575/* Search for a node which is duplicated from the node ORG_NODE, and 1576 satisfies the constraint CONSTRAINT. */ 1577 1578static Idx 1579search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1580 unsigned int constraint) 1581{ 1582 Idx idx; 1583 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1584 { 1585 if (org_node == dfa->org_indices[idx] 1586 && constraint == dfa->nodes[idx].constraint) 1587 return idx; /* Found. */ 1588 } 1589 return REG_MISSING; /* Not found. */ 1590} 1591 1592/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1593 Return the index of the new node, or REG_MISSING if insufficient storage is 1594 available. */ 1595 1596static Idx 1597duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1598{ 1599 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1600 if (BE (dup_idx != REG_MISSING, 1)) 1601 { 1602 dfa->nodes[dup_idx].constraint = constraint; 1603 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1604 dfa->nodes[dup_idx].duplicated = 1; 1605 1606 /* Store the index of the original node. */ 1607 dfa->org_indices[dup_idx] = org_idx; 1608 } 1609 return dup_idx; 1610} 1611 1612static reg_errcode_t 1613calc_inveclosure (re_dfa_t *dfa) 1614{ 1615 Idx src, idx; 1616 bool ok; 1617 for (idx = 0; idx < dfa->nodes_len; ++idx) 1618 re_node_set_init_empty (dfa->inveclosures + idx); 1619 1620 for (src = 0; src < dfa->nodes_len; ++src) 1621 { 1622 Idx *elems = dfa->eclosures[src].elems; 1623 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1624 { 1625 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1626 if (BE (! ok, 0)) 1627 return REG_ESPACE; 1628 } 1629 } 1630 1631 return REG_NOERROR; 1632} 1633 1634/* Calculate "eclosure" for all the node in DFA. */ 1635 1636static reg_errcode_t 1637calc_eclosure (re_dfa_t *dfa) 1638{ 1639 Idx node_idx; 1640 bool incomplete; 1641#ifdef DEBUG 1642 assert (dfa->nodes_len > 0); 1643#endif 1644 incomplete = false; 1645 /* For each nodes, calculate epsilon closure. */ 1646 for (node_idx = 0; ; ++node_idx) 1647 { 1648 reg_errcode_t err; 1649 re_node_set eclosure_elem; 1650 if (node_idx == dfa->nodes_len) 1651 { 1652 if (!incomplete) 1653 break; 1654 incomplete = false; 1655 node_idx = 0; 1656 } 1657 1658#ifdef DEBUG 1659 assert (dfa->eclosures[node_idx].nelem != REG_MISSING); 1660#endif 1661 1662 /* If we have already calculated, skip it. */ 1663 if (dfa->eclosures[node_idx].nelem != 0) 1664 continue; 1665 /* Calculate epsilon closure of `node_idx'. */ 1666 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1667 if (BE (err != REG_NOERROR, 0)) 1668 return err; 1669 1670 if (dfa->eclosures[node_idx].nelem == 0) 1671 { 1672 incomplete = true; 1673 re_node_set_free (&eclosure_elem); 1674 } 1675 } 1676 return REG_NOERROR; 1677} 1678 1679/* Calculate epsilon closure of NODE. */ 1680 1681static reg_errcode_t 1682calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1683{ 1684 reg_errcode_t err; 1685 Idx i; 1686 bool incomplete; 1687 bool ok; 1688 re_node_set eclosure; 1689 incomplete = false; 1690 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1691 if (BE (err != REG_NOERROR, 0)) 1692 return err; 1693 1694 /* This indicates that we are calculating this node now. 1695 We reference this value to avoid infinite loop. */ 1696 dfa->eclosures[node].nelem = REG_MISSING; 1697 1698 /* If the current node has constraints, duplicate all nodes 1699 since they must inherit the constraints. */ 1700 if (dfa->nodes[node].constraint 1701 && dfa->edests[node].nelem 1702 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1703 { 1704 err = duplicate_node_closure (dfa, node, node, node, 1705 dfa->nodes[node].constraint); 1706 if (BE (err != REG_NOERROR, 0)) 1707 return err; 1708 } 1709 1710 /* Expand each epsilon destination nodes. */ 1711 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1712 for (i = 0; i < dfa->edests[node].nelem; ++i) 1713 { 1714 re_node_set eclosure_elem; 1715 Idx edest = dfa->edests[node].elems[i]; 1716 /* If calculating the epsilon closure of `edest' is in progress, 1717 return intermediate result. */ 1718 if (dfa->eclosures[edest].nelem == REG_MISSING) 1719 { 1720 incomplete = true; 1721 continue; 1722 } 1723 /* If we haven't calculated the epsilon closure of `edest' yet, 1724 calculate now. Otherwise use calculated epsilon closure. */ 1725 if (dfa->eclosures[edest].nelem == 0) 1726 { 1727 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1728 if (BE (err != REG_NOERROR, 0)) 1729 return err; 1730 } 1731 else 1732 eclosure_elem = dfa->eclosures[edest]; 1733 /* Merge the epsilon closure of `edest'. */ 1734 re_node_set_merge (&eclosure, &eclosure_elem); 1735 /* If the epsilon closure of `edest' is incomplete, 1736 the epsilon closure of this node is also incomplete. */ 1737 if (dfa->eclosures[edest].nelem == 0) 1738 { 1739 incomplete = true; 1740 re_node_set_free (&eclosure_elem); 1741 } 1742 } 1743 1744 /* Epsilon closures include itself. */ 1745 ok = re_node_set_insert (&eclosure, node); 1746 if (BE (! ok, 0)) 1747 return REG_ESPACE; 1748 if (incomplete && !root) 1749 dfa->eclosures[node].nelem = 0; 1750 else 1751 dfa->eclosures[node] = eclosure; 1752 *new_set = eclosure; 1753 return REG_NOERROR; 1754} 1755 1756/* Functions for token which are used in the parser. */ 1757 1758/* Fetch a token from INPUT. 1759 We must not use this function inside bracket expressions. */ 1760 1761static void 1762internal_function 1763fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1764{ 1765 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1766} 1767 1768/* Peek a token from INPUT, and return the length of the token. 1769 We must not use this function inside bracket expressions. */ 1770 1771static int 1772internal_function 1773peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1774{ 1775 unsigned char c; 1776 1777 if (re_string_eoi (input)) 1778 { 1779 token->type = END_OF_RE; 1780 return 0; 1781 } 1782 1783 c = re_string_peek_byte (input, 0); 1784 token->opr.c = c; 1785 1786 token->word_char = 0; 1787#ifdef RE_ENABLE_I18N 1788 token->mb_partial = 0; 1789 if (input->mb_cur_max > 1 && 1790 !re_string_first_byte (input, re_string_cur_idx (input))) 1791 { 1792 token->type = CHARACTER; 1793 token->mb_partial = 1; 1794 return 1; 1795 } 1796#endif 1797 if (c == '\\') 1798 { 1799 unsigned char c2; 1800 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1801 { 1802 token->type = BACK_SLASH; 1803 return 1; 1804 } 1805 1806 c2 = re_string_peek_byte_case (input, 1); 1807 token->opr.c = c2; 1808 token->type = CHARACTER; 1809#ifdef RE_ENABLE_I18N 1810 if (input->mb_cur_max > 1) 1811 { 1812 wint_t wc = re_string_wchar_at (input, 1813 re_string_cur_idx (input) + 1); 1814 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1815 } 1816 else 1817#endif 1818 token->word_char = IS_WORD_CHAR (c2) != 0; 1819 1820 switch (c2) 1821 { 1822 case '|': 1823 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1824 token->type = OP_ALT; 1825 break; 1826 case '1': case '2': case '3': case '4': case '5': 1827 case '6': case '7': case '8': case '9': 1828 if (!(syntax & RE_NO_BK_REFS)) 1829 { 1830 token->type = OP_BACK_REF; 1831 token->opr.idx = c2 - '1'; 1832 } 1833 break; 1834 case '<': 1835 if (!(syntax & RE_NO_GNU_OPS)) 1836 { 1837 token->type = ANCHOR; 1838 token->opr.ctx_type = WORD_FIRST; 1839 } 1840 break; 1841 case '>': 1842 if (!(syntax & RE_NO_GNU_OPS)) 1843 { 1844 token->type = ANCHOR; 1845 token->opr.ctx_type = WORD_LAST; 1846 } 1847 break; 1848 case 'b': 1849 if (!(syntax & RE_NO_GNU_OPS)) 1850 { 1851 token->type = ANCHOR; 1852 token->opr.ctx_type = WORD_DELIM; 1853 } 1854 break; 1855 case 'B': 1856 if (!(syntax & RE_NO_GNU_OPS)) 1857 { 1858 token->type = ANCHOR; 1859 token->opr.ctx_type = NOT_WORD_DELIM; 1860 } 1861 break; 1862 case 'w': 1863 if (!(syntax & RE_NO_GNU_OPS)) 1864 token->type = OP_WORD; 1865 break; 1866 case 'W': 1867 if (!(syntax & RE_NO_GNU_OPS)) 1868 token->type = OP_NOTWORD; 1869 break; 1870 case 's': 1871 if (!(syntax & RE_NO_GNU_OPS)) 1872 token->type = OP_SPACE; 1873 break; 1874 case 'S': 1875 if (!(syntax & RE_NO_GNU_OPS)) 1876 token->type = OP_NOTSPACE; 1877 break; 1878 case '`': 1879 if (!(syntax & RE_NO_GNU_OPS)) 1880 { 1881 token->type = ANCHOR; 1882 token->opr.ctx_type = BUF_FIRST; 1883 } 1884 break; 1885 case '\'': 1886 if (!(syntax & RE_NO_GNU_OPS)) 1887 { 1888 token->type = ANCHOR; 1889 token->opr.ctx_type = BUF_LAST; 1890 } 1891 break; 1892 case '(': 1893 if (!(syntax & RE_NO_BK_PARENS)) 1894 token->type = OP_OPEN_SUBEXP; 1895 break; 1896 case ')': 1897 if (!(syntax & RE_NO_BK_PARENS)) 1898 token->type = OP_CLOSE_SUBEXP; 1899 break; 1900 case '+': 1901 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1902 token->type = OP_DUP_PLUS; 1903 break; 1904 case '?': 1905 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1906 token->type = OP_DUP_QUESTION; 1907 break; 1908 case '{': 1909 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1910 token->type = OP_OPEN_DUP_NUM; 1911 break; 1912 case '}': 1913 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1914 token->type = OP_CLOSE_DUP_NUM; 1915 break; 1916 default: 1917 break; 1918 } 1919 return 2; 1920 } 1921 1922 token->type = CHARACTER; 1923#ifdef RE_ENABLE_I18N 1924 if (input->mb_cur_max > 1) 1925 { 1926 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1927 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1928 } 1929 else 1930#endif 1931 token->word_char = IS_WORD_CHAR (token->opr.c); 1932 1933 switch (c) 1934 { 1935 case '\n': 1936 if (syntax & RE_NEWLINE_ALT) 1937 token->type = OP_ALT; 1938 break; 1939 case '|': 1940 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1941 token->type = OP_ALT; 1942 break; 1943 case '*': 1944 token->type = OP_DUP_ASTERISK; 1945 break; 1946 case '+': 1947 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1948 token->type = OP_DUP_PLUS; 1949 break; 1950 case '?': 1951 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1952 token->type = OP_DUP_QUESTION; 1953 break; 1954 case '{': 1955 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1956 token->type = OP_OPEN_DUP_NUM; 1957 break; 1958 case '}': 1959 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1960 token->type = OP_CLOSE_DUP_NUM; 1961 break; 1962 case '(': 1963 if (syntax & RE_NO_BK_PARENS) 1964 token->type = OP_OPEN_SUBEXP; 1965 break; 1966 case ')': 1967 if (syntax & RE_NO_BK_PARENS) 1968 token->type = OP_CLOSE_SUBEXP; 1969 break; 1970 case '[': 1971 token->type = OP_OPEN_BRACKET; 1972 break; 1973 case '.': 1974 token->type = OP_PERIOD; 1975 break; 1976 case '^': 1977 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 1978 re_string_cur_idx (input) != 0) 1979 { 1980 char prev = re_string_peek_byte (input, -1); 1981 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 1982 break; 1983 } 1984 token->type = ANCHOR; 1985 token->opr.ctx_type = LINE_FIRST; 1986 break; 1987 case '$': 1988 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1989 re_string_cur_idx (input) + 1 != re_string_length (input)) 1990 { 1991 re_token_t next; 1992 re_string_skip_bytes (input, 1); 1993 peek_token (&next, input, syntax); 1994 re_string_skip_bytes (input, -1); 1995 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1996 break; 1997 } 1998 token->type = ANCHOR; 1999 token->opr.ctx_type = LINE_LAST; 2000 break; 2001 default: 2002 break; 2003 } 2004 return 1; 2005} 2006 2007/* Peek a token from INPUT, and return the length of the token. 2008 We must not use this function out of bracket expressions. */ 2009 2010static int 2011internal_function 2012peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2013{ 2014 unsigned char c; 2015 if (re_string_eoi (input)) 2016 { 2017 token->type = END_OF_RE; 2018 return 0; 2019 } 2020 c = re_string_peek_byte (input, 0); 2021 token->opr.c = c; 2022 2023#ifdef RE_ENABLE_I18N 2024 if (input->mb_cur_max > 1 && 2025 !re_string_first_byte (input, re_string_cur_idx (input))) 2026 { 2027 token->type = CHARACTER; 2028 return 1; 2029 } 2030#endif /* RE_ENABLE_I18N */ 2031 2032 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2033 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2034 { 2035 /* In this case, '\' escape a character. */ 2036 unsigned char c2; 2037 re_string_skip_bytes (input, 1); 2038 c2 = re_string_peek_byte (input, 0); 2039 token->opr.c = c2; 2040 token->type = CHARACTER; 2041 return 1; 2042 } 2043 if (c == '[') /* '[' is a special char in a bracket exps. */ 2044 { 2045 unsigned char c2; 2046 int token_len; 2047 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2048 c2 = re_string_peek_byte (input, 1); 2049 else 2050 c2 = 0; 2051 token->opr.c = c2; 2052 token_len = 2; 2053 switch (c2) 2054 { 2055 case '.': 2056 token->type = OP_OPEN_COLL_ELEM; 2057 break; 2058 case '=': 2059 token->type = OP_OPEN_EQUIV_CLASS; 2060 break; 2061 case ':': 2062 if (syntax & RE_CHAR_CLASSES) 2063 { 2064 token->type = OP_OPEN_CHAR_CLASS; 2065 break; 2066 } 2067 /* else fall through. */ 2068 default: 2069 token->type = CHARACTER; 2070 token->opr.c = c; 2071 token_len = 1; 2072 break; 2073 } 2074 return token_len; 2075 } 2076 switch (c) 2077 { 2078 case '-': 2079 token->type = OP_CHARSET_RANGE; 2080 break; 2081 case ']': 2082 token->type = OP_CLOSE_BRACKET; 2083 break; 2084 case '^': 2085 token->type = OP_NON_MATCH_LIST; 2086 break; 2087 default: 2088 token->type = CHARACTER; 2089 } 2090 return 1; 2091} 2092 2093/* Functions for parser. */ 2094 2095/* Entry point of the parser. 2096 Parse the regular expression REGEXP and return the structure tree. 2097 If an error is occured, ERR is set by error code, and return NULL. 2098 This function build the following tree, from regular expression <reg_exp>: 2099 CAT 2100 / \ 2101 / \ 2102 <reg_exp> EOR 2103 2104 CAT means concatenation. 2105 EOR means end of regular expression. */ 2106 2107static bin_tree_t * 2108parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2109 reg_errcode_t *err) 2110{ 2111 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2112 bin_tree_t *tree, *eor, *root; 2113 re_token_t current_token; 2114 dfa->syntax = syntax; 2115 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2116 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2117 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2118 return NULL; 2119 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2120 if (tree != NULL) 2121 root = create_tree (dfa, tree, eor, CONCAT); 2122 else 2123 root = eor; 2124 if (BE (eor == NULL || root == NULL, 0)) 2125 { 2126 *err = REG_ESPACE; 2127 return NULL; 2128 } 2129 return root; 2130} 2131 2132/* This function build the following tree, from regular expression 2133 <branch1>|<branch2>: 2134 ALT 2135 / \ 2136 / \ 2137 <branch1> <branch2> 2138 2139 ALT means alternative, which represents the operator `|'. */ 2140 2141static bin_tree_t * 2142parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2143 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2144{ 2145 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2146 bin_tree_t *tree, *branch = NULL; 2147 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2148 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2149 return NULL; 2150 2151 while (token->type == OP_ALT) 2152 { 2153 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2154 if (token->type != OP_ALT && token->type != END_OF_RE 2155 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2156 { 2157 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2158 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2159 return NULL; 2160 } 2161 else 2162 branch = NULL; 2163 tree = create_tree (dfa, tree, branch, OP_ALT); 2164 if (BE (tree == NULL, 0)) 2165 { 2166 *err = REG_ESPACE; 2167 return NULL; 2168 } 2169 } 2170 return tree; 2171} 2172 2173/* This function build the following tree, from regular expression 2174 <exp1><exp2>: 2175 CAT 2176 / \ 2177 / \ 2178 <exp1> <exp2> 2179 2180 CAT means concatenation. */ 2181 2182static bin_tree_t * 2183parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2184 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2185{ 2186 bin_tree_t *tree, *expr; 2187 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2188 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2189 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2190 return NULL; 2191 2192 while (token->type != OP_ALT && token->type != END_OF_RE 2193 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2194 { 2195 expr = parse_expression (regexp, preg, token, syntax, nest, err); 2196 if (BE (*err != REG_NOERROR && expr == NULL, 0)) 2197 { 2198 return NULL; 2199 } 2200 if (tree != NULL && expr != NULL) 2201 { 2202 tree = create_tree (dfa, tree, expr, CONCAT); 2203 if (tree == NULL) 2204 { 2205 *err = REG_ESPACE; 2206 return NULL; 2207 } 2208 } 2209 else if (tree == NULL) 2210 tree = expr; 2211 /* Otherwise expr == NULL, we don't need to create new tree. */ 2212 } 2213 return tree; 2214} 2215 2216/* This function build the following tree, from regular expression a*: 2217 * 2218 | 2219 a 2220*/ 2221 2222static bin_tree_t * 2223parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2224 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2225{ 2226 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2227 bin_tree_t *tree; 2228 switch (token->type) 2229 { 2230 case CHARACTER: 2231 tree = create_token_tree (dfa, NULL, NULL, token); 2232 if (BE (tree == NULL, 0)) 2233 { 2234 *err = REG_ESPACE; 2235 return NULL; 2236 } 2237#ifdef RE_ENABLE_I18N 2238 if (dfa->mb_cur_max > 1) 2239 { 2240 while (!re_string_eoi (regexp) 2241 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2242 { 2243 bin_tree_t *mbc_remain; 2244 fetch_token (token, regexp, syntax); 2245 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2246 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2247 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2248 { 2249 *err = REG_ESPACE; 2250 return NULL; 2251 } 2252 } 2253 } 2254#endif 2255 break; 2256 case OP_OPEN_SUBEXP: 2257 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2258 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2259 return NULL; 2260 break; 2261 case OP_OPEN_BRACKET: 2262 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2263 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2264 return NULL; 2265 break; 2266 case OP_BACK_REF: 2267 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2268 { 2269 *err = REG_ESUBREG; 2270 return NULL; 2271 } 2272 dfa->used_bkref_map |= 1 << token->opr.idx; 2273 tree = create_token_tree (dfa, NULL, NULL, token); 2274 if (BE (tree == NULL, 0)) 2275 { 2276 *err = REG_ESPACE; 2277 return NULL; 2278 } 2279 ++dfa->nbackref; 2280 dfa->has_mb_node = 1; 2281 break; 2282 case OP_OPEN_DUP_NUM: 2283 if (syntax & RE_CONTEXT_INVALID_DUP) 2284 { 2285 *err = REG_BADRPT; 2286 return NULL; 2287 } 2288 /* FALLTHROUGH */ 2289 case OP_DUP_ASTERISK: 2290 case OP_DUP_PLUS: 2291 case OP_DUP_QUESTION: 2292 if (syntax & RE_CONTEXT_INVALID_OPS) 2293 { 2294 *err = REG_BADRPT; 2295 return NULL; 2296 } 2297 else if (syntax & RE_CONTEXT_INDEP_OPS) 2298 { 2299 fetch_token (token, regexp, syntax); 2300 return parse_expression (regexp, preg, token, syntax, nest, err); 2301 } 2302 /* else fall through */ 2303 case OP_CLOSE_SUBEXP: 2304 if ((token->type == OP_CLOSE_SUBEXP) && 2305 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2306 { 2307 *err = REG_ERPAREN; 2308 return NULL; 2309 } 2310 /* else fall through */ 2311 case OP_CLOSE_DUP_NUM: 2312 /* We treat it as a normal character. */ 2313 2314 /* Then we can these characters as normal characters. */ 2315 token->type = CHARACTER; 2316 /* mb_partial and word_char bits should be initialized already 2317 by peek_token. */ 2318 tree = create_token_tree (dfa, NULL, NULL, token); 2319 if (BE (tree == NULL, 0)) 2320 { 2321 *err = REG_ESPACE; 2322 return NULL; 2323 } 2324 break; 2325 case ANCHOR: 2326 if ((token->opr.ctx_type 2327 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2328 && dfa->word_ops_used == 0) 2329 init_word_char (dfa); 2330 if (token->opr.ctx_type == WORD_DELIM 2331 || token->opr.ctx_type == NOT_WORD_DELIM) 2332 { 2333 bin_tree_t *tree_first, *tree_last; 2334 if (token->opr.ctx_type == WORD_DELIM) 2335 { 2336 token->opr.ctx_type = WORD_FIRST; 2337 tree_first = create_token_tree (dfa, NULL, NULL, token); 2338 token->opr.ctx_type = WORD_LAST; 2339 } 2340 else 2341 { 2342 token->opr.ctx_type = INSIDE_WORD; 2343 tree_first = create_token_tree (dfa, NULL, NULL, token); 2344 token->opr.ctx_type = INSIDE_NOTWORD; 2345 } 2346 tree_last = create_token_tree (dfa, NULL, NULL, token); 2347 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2348 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2349 { 2350 *err = REG_ESPACE; 2351 return NULL; 2352 } 2353 } 2354 else 2355 { 2356 tree = create_token_tree (dfa, NULL, NULL, token); 2357 if (BE (tree == NULL, 0)) 2358 { 2359 *err = REG_ESPACE; 2360 return NULL; 2361 } 2362 } 2363 /* We must return here, since ANCHORs can't be followed 2364 by repetition operators. 2365 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2366 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2367 fetch_token (token, regexp, syntax); 2368 return tree; 2369 case OP_PERIOD: 2370 tree = create_token_tree (dfa, NULL, NULL, token); 2371 if (BE (tree == NULL, 0)) 2372 { 2373 *err = REG_ESPACE; 2374 return NULL; 2375 } 2376 if (dfa->mb_cur_max > 1) 2377 dfa->has_mb_node = 1; 2378 break; 2379 case OP_WORD: 2380 case OP_NOTWORD: 2381 tree = build_charclass_op (dfa, regexp->trans, 2382 (const unsigned char *) "alnum", 2383 (const unsigned char *) "_", 2384 token->type == OP_NOTWORD, err); 2385 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2386 return NULL; 2387 break; 2388 case OP_SPACE: 2389 case OP_NOTSPACE: 2390 tree = build_charclass_op (dfa, regexp->trans, 2391 (const unsigned char *) "space", 2392 (const unsigned char *) "", 2393 token->type == OP_NOTSPACE, err); 2394 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2395 return NULL; 2396 break; 2397 case OP_ALT: 2398 case END_OF_RE: 2399 return NULL; 2400 case BACK_SLASH: 2401 *err = REG_EESCAPE; 2402 return NULL; 2403 default: 2404 /* Must not happen? */ 2405#ifdef DEBUG 2406 assert (0); 2407#endif 2408 return NULL; 2409 } 2410 fetch_token (token, regexp, syntax); 2411 2412 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2413 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2414 { 2415 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2416 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2417 return NULL; 2418 /* In BRE consecutive duplications are not allowed. */ 2419 if ((syntax & RE_CONTEXT_INVALID_DUP) 2420 && (token->type == OP_DUP_ASTERISK 2421 || token->type == OP_OPEN_DUP_NUM)) 2422 { 2423 *err = REG_BADRPT; 2424 return NULL; 2425 } 2426 } 2427 2428 return tree; 2429} 2430 2431/* This function build the following tree, from regular expression 2432 (<reg_exp>): 2433 SUBEXP 2434 | 2435 <reg_exp> 2436*/ 2437 2438static bin_tree_t * 2439parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2440 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2441{ 2442 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2443 bin_tree_t *tree; 2444 size_t cur_nsub; 2445 cur_nsub = preg->re_nsub++; 2446 2447 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2448 2449 /* The subexpression may be a null string. */ 2450 if (token->type == OP_CLOSE_SUBEXP) 2451 tree = NULL; 2452 else 2453 { 2454 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2455 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2456 *err = REG_EPAREN; 2457 if (BE (*err != REG_NOERROR, 0)) 2458 return NULL; 2459 } 2460 2461 if (cur_nsub <= '9' - '1') 2462 dfa->completed_bkref_map |= 1 << cur_nsub; 2463 2464 tree = create_tree (dfa, tree, NULL, SUBEXP); 2465 if (BE (tree == NULL, 0)) 2466 { 2467 *err = REG_ESPACE; 2468 return NULL; 2469 } 2470 tree->token.opr.idx = cur_nsub; 2471 return tree; 2472} 2473 2474/* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2475 2476static bin_tree_t * 2477parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2478 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2479{ 2480 bin_tree_t *tree = NULL, *old_tree = NULL; 2481 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2482 re_token_t start_token = *token; 2483 2484 if (token->type == OP_OPEN_DUP_NUM) 2485 { 2486 end = 0; 2487 start = fetch_number (regexp, token, syntax); 2488 if (start == REG_MISSING) 2489 { 2490 if (token->type == CHARACTER && token->opr.c == ',') 2491 start = 0; /* We treat "{,m}" as "{0,m}". */ 2492 else 2493 { 2494 *err = REG_BADBR; /* <re>{} is invalid. */ 2495 return NULL; 2496 } 2497 } 2498 if (BE (start != REG_ERROR, 1)) 2499 { 2500 /* We treat "{n}" as "{n,n}". */ 2501 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2502 : ((token->type == CHARACTER && token->opr.c == ',') 2503 ? fetch_number (regexp, token, syntax) : REG_ERROR)); 2504 } 2505 if (BE (start == REG_ERROR || end == REG_ERROR, 0)) 2506 { 2507 /* Invalid sequence. */ 2508 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2509 { 2510 if (token->type == END_OF_RE) 2511 *err = REG_EBRACE; 2512 else 2513 *err = REG_BADBR; 2514 2515 return NULL; 2516 } 2517 2518 /* If the syntax bit is set, rollback. */ 2519 re_string_set_index (regexp, start_idx); 2520 *token = start_token; 2521 token->type = CHARACTER; 2522 /* mb_partial and word_char bits should be already initialized by 2523 peek_token. */ 2524 return elem; 2525 } 2526 2527 if (BE ((end != REG_MISSING && start > end) 2528 || token->type != OP_CLOSE_DUP_NUM, 0)) 2529 { 2530 /* First number greater than second. */ 2531 *err = REG_BADBR; 2532 return NULL; 2533 } 2534 } 2535 else 2536 { 2537 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2538 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; 2539 } 2540 2541 fetch_token (token, regexp, syntax); 2542 2543 if (BE (elem == NULL, 0)) 2544 return NULL; 2545 if (BE (start == 0 && end == 0, 0)) 2546 { 2547 postorder (elem, free_tree, NULL); 2548 return NULL; 2549 } 2550 2551 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2552 if (BE (start > 0, 0)) 2553 { 2554 tree = elem; 2555 for (i = 2; i <= start; ++i) 2556 { 2557 elem = duplicate_tree (elem, dfa); 2558 tree = create_tree (dfa, tree, elem, CONCAT); 2559 if (BE (elem == NULL || tree == NULL, 0)) 2560 goto parse_dup_op_espace; 2561 } 2562 2563 if (start == end) 2564 return tree; 2565 2566 /* Duplicate ELEM before it is marked optional. */ 2567 elem = duplicate_tree (elem, dfa); 2568 old_tree = tree; 2569 } 2570 else 2571 old_tree = NULL; 2572 2573 if (elem->token.type == SUBEXP) 2574 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2575 2576 tree = create_tree (dfa, elem, NULL, 2577 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2578 if (BE (tree == NULL, 0)) 2579 goto parse_dup_op_espace; 2580 2581 /* This loop is actually executed only when end != REG_MISSING, 2582 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2583 already created the start+1-th copy. */ 2584 verify (! TYPE_SIGNED (Idx)); 2585 if (end != REG_MISSING) 2586 for (i = start + 2; i <= end; ++i) 2587 { 2588 elem = duplicate_tree (elem, dfa); 2589 tree = create_tree (dfa, tree, elem, CONCAT); 2590 if (BE (elem == NULL || tree == NULL, 0)) 2591 goto parse_dup_op_espace; 2592 2593 tree = create_tree (dfa, tree, NULL, OP_ALT); 2594 if (BE (tree == NULL, 0)) 2595 goto parse_dup_op_espace; 2596 } 2597 2598 if (old_tree) 2599 tree = create_tree (dfa, old_tree, tree, CONCAT); 2600 2601 return tree; 2602 2603 parse_dup_op_espace: 2604 *err = REG_ESPACE; 2605 return NULL; 2606} 2607 2608/* Size of the names for collating symbol/equivalence_class/character_class. 2609 I'm not sure, but maybe enough. */ 2610#define BRACKET_NAME_BUF_SIZE 32 2611 2612#ifndef _LIBC 2613 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2614 Build the range expression which starts from START_ELEM, and ends 2615 at END_ELEM. The result are written to MBCSET and SBCSET. 2616 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2617 mbcset->range_ends, is a pointer argument sinse we may 2618 update it. */ 2619 2620static reg_errcode_t 2621internal_function 2622# ifdef RE_ENABLE_I18N 2623build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, 2624 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2625# else /* not RE_ENABLE_I18N */ 2626build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, 2627 bracket_elem_t *end_elem) 2628# endif /* not RE_ENABLE_I18N */ 2629{ 2630 unsigned int start_ch, end_ch; 2631 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2632 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2633 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2634 0)) 2635 return REG_ERANGE; 2636 2637 /* We can handle no multi character collating elements without libc 2638 support. */ 2639 if (BE ((start_elem->type == COLL_SYM 2640 && strlen ((char *) start_elem->opr.name) > 1) 2641 || (end_elem->type == COLL_SYM 2642 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2643 return REG_ECOLLATE; 2644 2645# ifdef RE_ENABLE_I18N 2646 { 2647 wchar_t wc; 2648 wint_t start_wc; 2649 wint_t end_wc; 2650 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2651 2652 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2653 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2654 : 0)); 2655 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2656 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2657 : 0)); 2658 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2659 ? __btowc (start_ch) : start_elem->opr.wch); 2660 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2661 ? __btowc (end_ch) : end_elem->opr.wch); 2662 if (start_wc == WEOF || end_wc == WEOF) 2663 return REG_ECOLLATE; 2664 cmp_buf[0] = start_wc; 2665 cmp_buf[4] = end_wc; 2666 if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2667 return REG_ERANGE; 2668 2669 /* Got valid collation sequence values, add them as a new entry. 2670 However, for !_LIBC we have no collation elements: if the 2671 character set is single byte, the single byte character set 2672 that we build below suffices. parse_bracket_exp passes 2673 no MBCSET if dfa->mb_cur_max == 1. */ 2674 if (mbcset) 2675 { 2676 /* Check the space of the arrays. */ 2677 if (BE (*range_alloc == mbcset->nranges, 0)) 2678 { 2679 /* There is not enough space, need realloc. */ 2680 wchar_t *new_array_start, *new_array_end; 2681 Idx new_nranges; 2682 2683 /* +1 in case of mbcset->nranges is 0. */ 2684 new_nranges = 2 * mbcset->nranges + 1; 2685 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2686 are NULL if *range_alloc == 0. */ 2687 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2688 new_nranges); 2689 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2690 new_nranges); 2691 2692 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2693 return REG_ESPACE; 2694 2695 mbcset->range_starts = new_array_start; 2696 mbcset->range_ends = new_array_end; 2697 *range_alloc = new_nranges; 2698 } 2699 2700 mbcset->range_starts[mbcset->nranges] = start_wc; 2701 mbcset->range_ends[mbcset->nranges++] = end_wc; 2702 } 2703 2704 /* Build the table for single byte characters. */ 2705 for (wc = 0; wc < SBC_MAX; ++wc) 2706 { 2707 cmp_buf[2] = wc; 2708 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2709 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2710 bitset_set (sbcset, wc); 2711 } 2712 } 2713# else /* not RE_ENABLE_I18N */ 2714 { 2715 unsigned int ch; 2716 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2717 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2718 : 0)); 2719 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2720 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2721 : 0)); 2722 if (start_ch > end_ch) 2723 return REG_ERANGE; 2724 /* Build the table for single byte characters. */ 2725 for (ch = 0; ch < SBC_MAX; ++ch) 2726 if (start_ch <= ch && ch <= end_ch) 2727 bitset_set (sbcset, ch); 2728 } 2729# endif /* not RE_ENABLE_I18N */ 2730 return REG_NOERROR; 2731} 2732#endif /* not _LIBC */ 2733 2734#ifndef _LIBC 2735/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2736 Build the collating element which is represented by NAME. 2737 The result are written to MBCSET and SBCSET. 2738 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2739 pointer argument since we may update it. */ 2740 2741static reg_errcode_t 2742internal_function 2743build_collating_symbol (bitset_t sbcset, 2744# ifdef RE_ENABLE_I18N 2745 re_charset_t *mbcset _UNUSED_PARAMETER_, 2746 Idx *coll_sym_alloc _UNUSED_PARAMETER_, 2747# endif 2748 const unsigned char *name) 2749{ 2750 size_t name_len = strlen ((const char *) name); 2751 if (BE (name_len != 1, 0)) 2752 return REG_ECOLLATE; 2753 else 2754 { 2755 bitset_set (sbcset, name[0]); 2756 return REG_NOERROR; 2757 } 2758} 2759#endif /* not _LIBC */ 2760 2761/* This function parse bracket expression like "[abc]", "[a-c]", 2762 "[[.a-a.]]" etc. */ 2763 2764static bin_tree_t * 2765parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2766 reg_syntax_t syntax, reg_errcode_t *err) 2767{ 2768#ifdef _LIBC 2769 const unsigned char *collseqmb; 2770 const char *collseqwc; 2771 uint32_t nrules; 2772 int32_t table_size; 2773 const int32_t *symb_table; 2774 const unsigned char *extra; 2775 2776 /* Local function for parse_bracket_exp used in _LIBC environement. 2777 Seek the collating symbol entry correspondings to NAME. 2778 Return the index of the symbol in the SYMB_TABLE. */ 2779 2780 auto inline int32_t 2781 __attribute ((always_inline)) 2782 seek_collating_symbol_entry (name, name_len) 2783 const unsigned char *name; 2784 size_t name_len; 2785 { 2786 int32_t hash = elem_hash ((const char *) name, name_len); 2787 int32_t elem = hash % table_size; 2788 if (symb_table[2 * elem] != 0) 2789 { 2790 int32_t second = hash % (table_size - 2) + 1; 2791 2792 do 2793 { 2794 /* First compare the hashing value. */ 2795 if (symb_table[2 * elem] == hash 2796 /* Compare the length of the name. */ 2797 && name_len == extra[symb_table[2 * elem + 1]] 2798 /* Compare the name. */ 2799 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2800 name_len) == 0) 2801 { 2802 /* Yep, this is the entry. */ 2803 break; 2804 } 2805 2806 /* Next entry. */ 2807 elem += second; 2808 } 2809 while (symb_table[2 * elem] != 0); 2810 } 2811 return elem; 2812 } 2813 2814 /* Local function for parse_bracket_exp used in _LIBC environment. 2815 Look up the collation sequence value of BR_ELEM. 2816 Return the value if succeeded, UINT_MAX otherwise. */ 2817 2818 auto inline unsigned int 2819 __attribute ((always_inline)) 2820 lookup_collation_sequence_value (br_elem) 2821 bracket_elem_t *br_elem; 2822 { 2823 if (br_elem->type == SB_CHAR) 2824 { 2825 /* 2826 if (MB_CUR_MAX == 1) 2827 */ 2828 if (nrules == 0) 2829 return collseqmb[br_elem->opr.ch]; 2830 else 2831 { 2832 wint_t wc = __btowc (br_elem->opr.ch); 2833 return __collseq_table_lookup (collseqwc, wc); 2834 } 2835 } 2836 else if (br_elem->type == MB_CHAR) 2837 { 2838 if (nrules != 0) 2839 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2840 } 2841 else if (br_elem->type == COLL_SYM) 2842 { 2843 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2844 if (nrules != 0) 2845 { 2846 int32_t elem, idx; 2847 elem = seek_collating_symbol_entry (br_elem->opr.name, 2848 sym_name_len); 2849 if (symb_table[2 * elem] != 0) 2850 { 2851 /* We found the entry. */ 2852 idx = symb_table[2 * elem + 1]; 2853 /* Skip the name of collating element name. */ 2854 idx += 1 + extra[idx]; 2855 /* Skip the byte sequence of the collating element. */ 2856 idx += 1 + extra[idx]; 2857 /* Adjust for the alignment. */ 2858 idx = (idx + 3) & ~3; 2859 /* Skip the multibyte collation sequence value. */ 2860 idx += sizeof (unsigned int); 2861 /* Skip the wide char sequence of the collating element. */ 2862 idx += sizeof (unsigned int) * 2863 (1 + *(unsigned int *) (extra + idx)); 2864 /* Return the collation sequence value. */ 2865 return *(unsigned int *) (extra + idx); 2866 } 2867 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2868 { 2869 /* No valid character. Match it as a single byte 2870 character. */ 2871 return collseqmb[br_elem->opr.name[0]]; 2872 } 2873 } 2874 else if (sym_name_len == 1) 2875 return collseqmb[br_elem->opr.name[0]]; 2876 } 2877 return UINT_MAX; 2878 } 2879 2880 /* Local function for parse_bracket_exp used in _LIBC environement. 2881 Build the range expression which starts from START_ELEM, and ends 2882 at END_ELEM. The result are written to MBCSET and SBCSET. 2883 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2884 mbcset->range_ends, is a pointer argument sinse we may 2885 update it. */ 2886 2887 auto inline reg_errcode_t 2888 __attribute ((always_inline)) 2889 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2890 re_charset_t *mbcset; 2891 Idx *range_alloc; 2892 bitset_t sbcset; 2893 bracket_elem_t *start_elem, *end_elem; 2894 { 2895 unsigned int ch; 2896 uint32_t start_collseq; 2897 uint32_t end_collseq; 2898 2899 /* Equivalence Classes and Character Classes can't be a range 2900 start/end. */ 2901 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2902 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2903 0)) 2904 return REG_ERANGE; 2905 2906 start_collseq = lookup_collation_sequence_value (start_elem); 2907 end_collseq = lookup_collation_sequence_value (end_elem); 2908 /* Check start/end collation sequence values. */ 2909 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2910 return REG_ECOLLATE; 2911 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2912 return REG_ERANGE; 2913 2914 /* Got valid collation sequence values, add them as a new entry. 2915 However, if we have no collation elements, and the character set 2916 is single byte, the single byte character set that we 2917 build below suffices. */ 2918 if (nrules > 0 || dfa->mb_cur_max > 1) 2919 { 2920 /* Check the space of the arrays. */ 2921 if (BE (*range_alloc == mbcset->nranges, 0)) 2922 { 2923 /* There is not enough space, need realloc. */ 2924 uint32_t *new_array_start; 2925 uint32_t *new_array_end; 2926 Idx new_nranges; 2927 2928 /* +1 in case of mbcset->nranges is 0. */ 2929 new_nranges = 2 * mbcset->nranges + 1; 2930 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2931 new_nranges); 2932 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2933 new_nranges); 2934 2935 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2936 return REG_ESPACE; 2937 2938 mbcset->range_starts = new_array_start; 2939 mbcset->range_ends = new_array_end; 2940 *range_alloc = new_nranges; 2941 } 2942 2943 mbcset->range_starts[mbcset->nranges] = start_collseq; 2944 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2945 } 2946 2947 /* Build the table for single byte characters. */ 2948 for (ch = 0; ch < SBC_MAX; ch++) 2949 { 2950 uint32_t ch_collseq; 2951 /* 2952 if (MB_CUR_MAX == 1) 2953 */ 2954 if (nrules == 0) 2955 ch_collseq = collseqmb[ch]; 2956 else 2957 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2958 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2959 bitset_set (sbcset, ch); 2960 } 2961 return REG_NOERROR; 2962 } 2963 2964 /* Local function for parse_bracket_exp used in _LIBC environement. 2965 Build the collating element which is represented by NAME. 2966 The result are written to MBCSET and SBCSET. 2967 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2968 pointer argument sinse we may update it. */ 2969 2970 auto inline reg_errcode_t 2971 __attribute ((always_inline)) 2972 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2973 re_charset_t *mbcset; 2974 Idx *coll_sym_alloc; 2975 bitset_t sbcset; 2976 const unsigned char *name; 2977 { 2978 int32_t elem, idx; 2979 size_t name_len = strlen ((const char *) name); 2980 if (nrules != 0) 2981 { 2982 elem = seek_collating_symbol_entry (name, name_len); 2983 if (symb_table[2 * elem] != 0) 2984 { 2985 /* We found the entry. */ 2986 idx = symb_table[2 * elem + 1]; 2987 /* Skip the name of collating element name. */ 2988 idx += 1 + extra[idx]; 2989 } 2990 else if (symb_table[2 * elem] == 0 && name_len == 1) 2991 { 2992 /* No valid character, treat it as a normal 2993 character. */ 2994 bitset_set (sbcset, name[0]); 2995 return REG_NOERROR; 2996 } 2997 else 2998 return REG_ECOLLATE; 2999 3000 /* Got valid collation sequence, add it as a new entry. */ 3001 /* Check the space of the arrays. */ 3002 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3003 { 3004 /* Not enough, realloc it. */ 3005 /* +1 in case of mbcset->ncoll_syms is 0. */ 3006 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3007 /* Use realloc since mbcset->coll_syms is NULL 3008 if *alloc == 0. */ 3009 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3010 new_coll_sym_alloc); 3011 if (BE (new_coll_syms == NULL, 0)) 3012 return REG_ESPACE; 3013 mbcset->coll_syms = new_coll_syms; 3014 *coll_sym_alloc = new_coll_sym_alloc; 3015 } 3016 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3017 return REG_NOERROR; 3018 } 3019 else 3020 { 3021 if (BE (name_len != 1, 0)) 3022 return REG_ECOLLATE; 3023 else 3024 { 3025 bitset_set (sbcset, name[0]); 3026 return REG_NOERROR; 3027 } 3028 } 3029 } 3030#endif 3031 3032 re_token_t br_token; 3033 re_bitset_ptr_t sbcset; 3034#ifdef RE_ENABLE_I18N 3035 re_charset_t *mbcset; 3036 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3037 Idx equiv_class_alloc = 0, char_class_alloc = 0; 3038#endif /* not RE_ENABLE_I18N */ 3039 bool non_match = false; 3040 bin_tree_t *work_tree; 3041 int token_len; 3042 bool first_round = true; 3043#ifdef _LIBC 3044 collseqmb = (const unsigned char *) 3045 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3046 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3047 if (nrules) 3048 { 3049 /* 3050 if (MB_CUR_MAX > 1) 3051 */ 3052 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3053 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3054 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3055 _NL_COLLATE_SYMB_TABLEMB); 3056 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3057 _NL_COLLATE_SYMB_EXTRAMB); 3058 } 3059#endif 3060 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3061#ifdef RE_ENABLE_I18N 3062 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3063#endif /* RE_ENABLE_I18N */ 3064#ifdef RE_ENABLE_I18N 3065 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3066#else 3067 if (BE (sbcset == NULL, 0)) 3068#endif /* RE_ENABLE_I18N */ 3069 { 3070 *err = REG_ESPACE; 3071 return NULL; 3072 } 3073 3074 token_len = peek_token_bracket (token, regexp, syntax); 3075 if (BE (token->type == END_OF_RE, 0)) 3076 { 3077 *err = REG_BADPAT; 3078 goto parse_bracket_exp_free_return; 3079 } 3080 if (token->type == OP_NON_MATCH_LIST) 3081 { 3082#ifdef RE_ENABLE_I18N 3083 mbcset->non_match = 1; 3084#endif /* not RE_ENABLE_I18N */ 3085 non_match = true; 3086 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3087 bitset_set (sbcset, '\n'); 3088 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3089 token_len = peek_token_bracket (token, regexp, syntax); 3090 if (BE (token->type == END_OF_RE, 0)) 3091 { 3092 *err = REG_BADPAT; 3093 goto parse_bracket_exp_free_return; 3094 } 3095 } 3096 3097 /* We treat the first ']' as a normal character. */ 3098 if (token->type == OP_CLOSE_BRACKET) 3099 token->type = CHARACTER; 3100 3101 while (1) 3102 { 3103 bracket_elem_t start_elem, end_elem; 3104 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3105 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3106 reg_errcode_t ret; 3107 int token_len2 = 0; 3108 bool is_range_exp = false; 3109 re_token_t token2; 3110 3111 start_elem.opr.name = start_name_buf; 3112 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3113 syntax, first_round); 3114 if (BE (ret != REG_NOERROR, 0)) 3115 { 3116 *err = ret; 3117 goto parse_bracket_exp_free_return; 3118 } 3119 first_round = false; 3120 3121 /* Get information about the next token. We need it in any case. */ 3122 token_len = peek_token_bracket (token, regexp, syntax); 3123 3124 /* Do not check for ranges if we know they are not allowed. */ 3125 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3126 { 3127 if (BE (token->type == END_OF_RE, 0)) 3128 { 3129 *err = REG_EBRACK; 3130 goto parse_bracket_exp_free_return; 3131 } 3132 if (token->type == OP_CHARSET_RANGE) 3133 { 3134 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3135 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3136 if (BE (token2.type == END_OF_RE, 0)) 3137 { 3138 *err = REG_EBRACK; 3139 goto parse_bracket_exp_free_return; 3140 } 3141 if (token2.type == OP_CLOSE_BRACKET) 3142 { 3143 /* We treat the last '-' as a normal character. */ 3144 re_string_skip_bytes (regexp, -token_len); 3145 token->type = CHARACTER; 3146 } 3147 else 3148 is_range_exp = true; 3149 } 3150 } 3151 3152 if (is_range_exp == true) 3153 { 3154 end_elem.opr.name = end_name_buf; 3155 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3156 dfa, syntax, true); 3157 if (BE (ret != REG_NOERROR, 0)) 3158 { 3159 *err = ret; 3160 goto parse_bracket_exp_free_return; 3161 } 3162 3163 token_len = peek_token_bracket (token, regexp, syntax); 3164 3165#ifdef _LIBC 3166 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3167 &start_elem, &end_elem); 3168#else 3169# ifdef RE_ENABLE_I18N 3170 *err = build_range_exp (sbcset, 3171 dfa->mb_cur_max > 1 ? mbcset : NULL, 3172 &range_alloc, &start_elem, &end_elem); 3173# else 3174 *err = build_range_exp (sbcset, &start_elem, &end_elem); 3175# endif 3176#endif /* RE_ENABLE_I18N */ 3177 if (BE (*err != REG_NOERROR, 0)) 3178 goto parse_bracket_exp_free_return; 3179 } 3180 else 3181 { 3182 switch (start_elem.type) 3183 { 3184 case SB_CHAR: 3185 bitset_set (sbcset, start_elem.opr.ch); 3186 break; 3187#ifdef RE_ENABLE_I18N 3188 case MB_CHAR: 3189 /* Check whether the array has enough space. */ 3190 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3191 { 3192 wchar_t *new_mbchars; 3193 /* Not enough, realloc it. */ 3194 /* +1 in case of mbcset->nmbchars is 0. */ 3195 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3196 /* Use realloc since array is NULL if *alloc == 0. */ 3197 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3198 mbchar_alloc); 3199 if (BE (new_mbchars == NULL, 0)) 3200 goto parse_bracket_exp_espace; 3201 mbcset->mbchars = new_mbchars; 3202 } 3203 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3204 break; 3205#endif /* RE_ENABLE_I18N */ 3206 case EQUIV_CLASS: 3207 *err = build_equiv_class (sbcset, 3208#ifdef RE_ENABLE_I18N 3209 mbcset, &equiv_class_alloc, 3210#endif /* RE_ENABLE_I18N */ 3211 start_elem.opr.name); 3212 if (BE (*err != REG_NOERROR, 0)) 3213 goto parse_bracket_exp_free_return; 3214 break; 3215 case COLL_SYM: 3216 *err = build_collating_symbol (sbcset, 3217#ifdef RE_ENABLE_I18N 3218 mbcset, &coll_sym_alloc, 3219#endif /* RE_ENABLE_I18N */ 3220 start_elem.opr.name); 3221 if (BE (*err != REG_NOERROR, 0)) 3222 goto parse_bracket_exp_free_return; 3223 break; 3224 case CHAR_CLASS: 3225 *err = build_charclass (regexp->trans, sbcset, 3226#ifdef RE_ENABLE_I18N 3227 mbcset, &char_class_alloc, 3228#endif /* RE_ENABLE_I18N */ 3229 start_elem.opr.name, syntax); 3230 if (BE (*err != REG_NOERROR, 0)) 3231 goto parse_bracket_exp_free_return; 3232 break; 3233 default: 3234 assert (0); 3235 break; 3236 } 3237 } 3238 if (BE (token->type == END_OF_RE, 0)) 3239 { 3240 *err = REG_EBRACK; 3241 goto parse_bracket_exp_free_return; 3242 } 3243 if (token->type == OP_CLOSE_BRACKET) 3244 break; 3245 } 3246 3247 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3248 3249 /* If it is non-matching list. */ 3250 if (non_match) 3251 bitset_not (sbcset); 3252 3253#ifdef RE_ENABLE_I18N 3254 /* Ensure only single byte characters are set. */ 3255 if (dfa->mb_cur_max > 1) 3256 bitset_mask (sbcset, dfa->sb_char); 3257 3258 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3259 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3260 || mbcset->non_match))) 3261 { 3262 bin_tree_t *mbc_tree; 3263 int sbc_idx; 3264 /* Build a tree for complex bracket. */ 3265 dfa->has_mb_node = 1; 3266 br_token.type = COMPLEX_BRACKET; 3267 br_token.opr.mbcset = mbcset; 3268 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3269 if (BE (mbc_tree == NULL, 0)) 3270 goto parse_bracket_exp_espace; 3271 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3272 if (sbcset[sbc_idx]) 3273 break; 3274 /* If there are no bits set in sbcset, there is no point 3275 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3276 if (sbc_idx < BITSET_WORDS) 3277 { 3278 /* Build a tree for simple bracket. */ 3279 br_token.type = SIMPLE_BRACKET; 3280 br_token.opr.sbcset = sbcset; 3281 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3282 if (BE (work_tree == NULL, 0)) 3283 goto parse_bracket_exp_espace; 3284 3285 /* Then join them by ALT node. */ 3286 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3287 if (BE (work_tree == NULL, 0)) 3288 goto parse_bracket_exp_espace; 3289 } 3290 else 3291 { 3292 re_free (sbcset); 3293 work_tree = mbc_tree; 3294 } 3295 } 3296 else 3297#endif /* not RE_ENABLE_I18N */ 3298 { 3299#ifdef RE_ENABLE_I18N 3300 free_charset (mbcset); 3301#endif 3302 /* Build a tree for simple bracket. */ 3303 br_token.type = SIMPLE_BRACKET; 3304 br_token.opr.sbcset = sbcset; 3305 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3306 if (BE (work_tree == NULL, 0)) 3307 goto parse_bracket_exp_espace; 3308 } 3309 return work_tree; 3310 3311 parse_bracket_exp_espace: 3312 *err = REG_ESPACE; 3313 parse_bracket_exp_free_return: 3314 re_free (sbcset); 3315#ifdef RE_ENABLE_I18N 3316 free_charset (mbcset); 3317#endif /* RE_ENABLE_I18N */ 3318 return NULL; 3319} 3320 3321/* Parse an element in the bracket expression. */ 3322 3323static reg_errcode_t 3324parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3325 re_token_t *token, int token_len, 3326 re_dfa_t *dfa _UNUSED_PARAMETER_, 3327 reg_syntax_t syntax, bool accept_hyphen) 3328{ 3329#ifdef RE_ENABLE_I18N 3330 int cur_char_size; 3331 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3332 if (cur_char_size > 1) 3333 { 3334 elem->type = MB_CHAR; 3335 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3336 re_string_skip_bytes (regexp, cur_char_size); 3337 return REG_NOERROR; 3338 } 3339#endif /* RE_ENABLE_I18N */ 3340 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3341 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3342 || token->type == OP_OPEN_EQUIV_CLASS) 3343 return parse_bracket_symbol (elem, regexp, token); 3344 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3345 { 3346 /* A '-' must only appear as anything but a range indicator before 3347 the closing bracket. Everything else is an error. */ 3348 re_token_t token2; 3349 (void) peek_token_bracket (&token2, regexp, syntax); 3350 if (token2.type != OP_CLOSE_BRACKET) 3351 /* The actual error value is not standardized since this whole 3352 case is undefined. But ERANGE makes good sense. */ 3353 return REG_ERANGE; 3354 } 3355 elem->type = SB_CHAR; 3356 elem->opr.ch = token->opr.c; 3357 return REG_NOERROR; 3358} 3359 3360/* Parse a bracket symbol in the bracket expression. Bracket symbols are 3361 such as [:<character_class>:], [.<collating_element>.], and 3362 [=<equivalent_class>=]. */ 3363 3364static reg_errcode_t 3365parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3366 re_token_t *token) 3367{ 3368 unsigned char ch, delim = token->opr.c; 3369 int i = 0; 3370 if (re_string_eoi(regexp)) 3371 return REG_EBRACK; 3372 for (;; ++i) 3373 { 3374 if (i >= BRACKET_NAME_BUF_SIZE) 3375 return REG_EBRACK; 3376 if (token->type == OP_OPEN_CHAR_CLASS) 3377 ch = re_string_fetch_byte_case (regexp); 3378 else 3379 ch = re_string_fetch_byte (regexp); 3380 if (re_string_eoi(regexp)) 3381 return REG_EBRACK; 3382 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3383 break; 3384 elem->opr.name[i] = ch; 3385 } 3386 re_string_skip_bytes (regexp, 1); 3387 elem->opr.name[i] = '\0'; 3388 switch (token->type) 3389 { 3390 case OP_OPEN_COLL_ELEM: 3391 elem->type = COLL_SYM; 3392 break; 3393 case OP_OPEN_EQUIV_CLASS: 3394 elem->type = EQUIV_CLASS; 3395 break; 3396 case OP_OPEN_CHAR_CLASS: 3397 elem->type = CHAR_CLASS; 3398 break; 3399 default: 3400 break; 3401 } 3402 return REG_NOERROR; 3403} 3404 3405 /* Helper function for parse_bracket_exp. 3406 Build the equivalence class which is represented by NAME. 3407 The result are written to MBCSET and SBCSET. 3408 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3409 is a pointer argument sinse we may update it. */ 3410 3411static reg_errcode_t 3412#ifdef RE_ENABLE_I18N 3413build_equiv_class (bitset_t sbcset, re_charset_t *mbcset _UNUSED_PARAMETER_, 3414 Idx *equiv_class_alloc _UNUSED_PARAMETER_, 3415 const unsigned char *name) 3416#else /* not RE_ENABLE_I18N */ 3417build_equiv_class (bitset_t sbcset, const unsigned char *name) 3418#endif /* not RE_ENABLE_I18N */ 3419{ 3420#ifdef _LIBC 3421 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3422 if (nrules != 0) 3423 { 3424 const int32_t *table, *indirect; 3425 const unsigned char *weights, *extra, *cp; 3426 unsigned char char_buf[2]; 3427 int32_t idx1, idx2; 3428 unsigned int ch; 3429 size_t len; 3430 /* This #include defines a local function! */ 3431# include <locale/weight.h> 3432 /* Calculate the index for equivalence class. */ 3433 cp = name; 3434 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3435 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3436 _NL_COLLATE_WEIGHTMB); 3437 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3438 _NL_COLLATE_EXTRAMB); 3439 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3440 _NL_COLLATE_INDIRECTMB); 3441 idx1 = findidx (&cp); 3442 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3443 /* This isn't a valid character. */ 3444 return REG_ECOLLATE; 3445 3446 /* Build single byte matcing table for this equivalence class. */ 3447 char_buf[1] = (unsigned char) '\0'; 3448 len = weights[idx1 & 0xffffff]; 3449 for (ch = 0; ch < SBC_MAX; ++ch) 3450 { 3451 char_buf[0] = ch; 3452 cp = char_buf; 3453 idx2 = findidx (&cp); 3454/* 3455 idx2 = table[ch]; 3456*/ 3457 if (idx2 == 0) 3458 /* This isn't a valid character. */ 3459 continue; 3460 /* Compare only if the length matches and the collation rule 3461 index is the same. */ 3462 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3463 { 3464 int cnt = 0; 3465 3466 while (cnt <= len && 3467 weights[(idx1 & 0xffffff) + 1 + cnt] 3468 == weights[(idx2 & 0xffffff) + 1 + cnt]) 3469 ++cnt; 3470 3471 if (cnt > len) 3472 bitset_set (sbcset, ch); 3473 } 3474 } 3475 /* Check whether the array has enough space. */ 3476 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3477 { 3478 /* Not enough, realloc it. */ 3479 /* +1 in case of mbcset->nequiv_classes is 0. */ 3480 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3481 /* Use realloc since the array is NULL if *alloc == 0. */ 3482 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3483 int32_t, 3484 new_equiv_class_alloc); 3485 if (BE (new_equiv_classes == NULL, 0)) 3486 return REG_ESPACE; 3487 mbcset->equiv_classes = new_equiv_classes; 3488 *equiv_class_alloc = new_equiv_class_alloc; 3489 } 3490 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3491 } 3492 else 3493#endif /* _LIBC */ 3494 { 3495 if (BE (strlen ((const char *) name) != 1, 0)) 3496 return REG_ECOLLATE; 3497 bitset_set (sbcset, *name); 3498 } 3499 return REG_NOERROR; 3500} 3501 3502 /* Helper function for parse_bracket_exp. 3503 Build the character class which is represented by NAME. 3504 The result are written to MBCSET and SBCSET. 3505 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3506 is a pointer argument sinse we may update it. */ 3507 3508static reg_errcode_t 3509#ifdef RE_ENABLE_I18N 3510build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3511 re_charset_t *mbcset, Idx *char_class_alloc, 3512 const unsigned char *class_name, reg_syntax_t syntax) 3513#else /* not RE_ENABLE_I18N */ 3514build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3515 const unsigned char *class_name, reg_syntax_t syntax) 3516#endif /* not RE_ENABLE_I18N */ 3517{ 3518 int i; 3519 const char *name = (const char *) class_name; 3520 3521 /* In case of REG_ICASE "upper" and "lower" match the both of 3522 upper and lower cases. */ 3523 if ((syntax & RE_ICASE) 3524 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3525 name = "alpha"; 3526 3527#ifdef RE_ENABLE_I18N 3528 /* Check the space of the arrays. */ 3529 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3530 { 3531 /* Not enough, realloc it. */ 3532 /* +1 in case of mbcset->nchar_classes is 0. */ 3533 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3534 /* Use realloc since array is NULL if *alloc == 0. */ 3535 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3536 new_char_class_alloc); 3537 if (BE (new_char_classes == NULL, 0)) 3538 return REG_ESPACE; 3539 mbcset->char_classes = new_char_classes; 3540 *char_class_alloc = new_char_class_alloc; 3541 } 3542 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3543#endif /* RE_ENABLE_I18N */ 3544 3545#define BUILD_CHARCLASS_LOOP(ctype_func) \ 3546 do { \ 3547 if (BE (trans != NULL, 0)) \ 3548 { \ 3549 for (i = 0; i < SBC_MAX; ++i) \ 3550 if (ctype_func (i)) \ 3551 bitset_set (sbcset, trans[i]); \ 3552 } \ 3553 else \ 3554 { \ 3555 for (i = 0; i < SBC_MAX; ++i) \ 3556 if (ctype_func (i)) \ 3557 bitset_set (sbcset, i); \ 3558 } \ 3559 } while (0) 3560 3561 if (strcmp (name, "alnum") == 0) 3562 BUILD_CHARCLASS_LOOP (isalnum); 3563 else if (strcmp (name, "cntrl") == 0) 3564 BUILD_CHARCLASS_LOOP (iscntrl); 3565 else if (strcmp (name, "lower") == 0) 3566 BUILD_CHARCLASS_LOOP (islower); 3567 else if (strcmp (name, "space") == 0) 3568 BUILD_CHARCLASS_LOOP (isspace); 3569 else if (strcmp (name, "alpha") == 0) 3570 BUILD_CHARCLASS_LOOP (isalpha); 3571 else if (strcmp (name, "digit") == 0) 3572 BUILD_CHARCLASS_LOOP (isdigit); 3573 else if (strcmp (name, "print") == 0) 3574 BUILD_CHARCLASS_LOOP (isprint); 3575 else if (strcmp (name, "upper") == 0) 3576 BUILD_CHARCLASS_LOOP (isupper); 3577 else if (strcmp (name, "blank") == 0) 3578 BUILD_CHARCLASS_LOOP (isblank); 3579 else if (strcmp (name, "graph") == 0) 3580 BUILD_CHARCLASS_LOOP (isgraph); 3581 else if (strcmp (name, "punct") == 0) 3582 BUILD_CHARCLASS_LOOP (ispunct); 3583 else if (strcmp (name, "xdigit") == 0) 3584 BUILD_CHARCLASS_LOOP (isxdigit); 3585 else 3586 return REG_ECTYPE; 3587 3588 return REG_NOERROR; 3589} 3590 3591static bin_tree_t * 3592build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3593 const unsigned char *class_name, 3594 const unsigned char *extra, bool non_match, 3595 reg_errcode_t *err) 3596{ 3597 re_bitset_ptr_t sbcset; 3598#ifdef RE_ENABLE_I18N 3599 re_charset_t *mbcset; 3600 Idx alloc = 0; 3601#endif /* not RE_ENABLE_I18N */ 3602 reg_errcode_t ret; 3603 re_token_t br_token; 3604 bin_tree_t *tree; 3605 3606 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3607#ifdef RE_ENABLE_I18N 3608 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3609#endif /* RE_ENABLE_I18N */ 3610 3611#ifdef RE_ENABLE_I18N 3612 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3613#else /* not RE_ENABLE_I18N */ 3614 if (BE (sbcset == NULL, 0)) 3615#endif /* not RE_ENABLE_I18N */ 3616 { 3617 *err = REG_ESPACE; 3618 return NULL; 3619 } 3620 3621 if (non_match) 3622 { 3623#ifdef RE_ENABLE_I18N 3624 mbcset->non_match = 1; 3625#endif /* not RE_ENABLE_I18N */ 3626 } 3627 3628 /* We don't care the syntax in this case. */ 3629 ret = build_charclass (trans, sbcset, 3630#ifdef RE_ENABLE_I18N 3631 mbcset, &alloc, 3632#endif /* RE_ENABLE_I18N */ 3633 class_name, 0); 3634 3635 if (BE (ret != REG_NOERROR, 0)) 3636 { 3637 re_free (sbcset); 3638#ifdef RE_ENABLE_I18N 3639 free_charset (mbcset); 3640#endif /* RE_ENABLE_I18N */ 3641 *err = ret; 3642 return NULL; 3643 } 3644 /* \w match '_' also. */ 3645 for (; *extra; extra++) 3646 bitset_set (sbcset, *extra); 3647 3648 /* If it is non-matching list. */ 3649 if (non_match) 3650 bitset_not (sbcset); 3651 3652#ifdef RE_ENABLE_I18N 3653 /* Ensure only single byte characters are set. */ 3654 if (dfa->mb_cur_max > 1) 3655 bitset_mask (sbcset, dfa->sb_char); 3656#endif 3657 3658 /* Build a tree for simple bracket. */ 3659 br_token.type = SIMPLE_BRACKET; 3660 br_token.opr.sbcset = sbcset; 3661 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3662 if (BE (tree == NULL, 0)) 3663 goto build_word_op_espace; 3664 3665#ifdef RE_ENABLE_I18N 3666 if (dfa->mb_cur_max > 1) 3667 { 3668 bin_tree_t *mbc_tree; 3669 /* Build a tree for complex bracket. */ 3670 br_token.type = COMPLEX_BRACKET; 3671 br_token.opr.mbcset = mbcset; 3672 dfa->has_mb_node = 1; 3673 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3674 if (BE (mbc_tree == NULL, 0)) 3675 goto build_word_op_espace; 3676 /* Then join them by ALT node. */ 3677 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3678 if (BE (mbc_tree != NULL, 1)) 3679 return tree; 3680 } 3681 else 3682 { 3683 free_charset (mbcset); 3684 return tree; 3685 } 3686#else /* not RE_ENABLE_I18N */ 3687 return tree; 3688#endif /* not RE_ENABLE_I18N */ 3689 3690 build_word_op_espace: 3691 re_free (sbcset); 3692#ifdef RE_ENABLE_I18N 3693 free_charset (mbcset); 3694#endif /* RE_ENABLE_I18N */ 3695 *err = REG_ESPACE; 3696 return NULL; 3697} 3698 3699/* This is intended for the expressions like "a{1,3}". 3700 Fetch a number from `input', and return the number. 3701 Return REG_MISSING if the number field is empty like "{,1}". 3702 Return REG_ERROR if an error occurred. */ 3703 3704static Idx 3705fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3706{ 3707 Idx num = REG_MISSING; 3708 unsigned char c; 3709 while (1) 3710 { 3711 fetch_token (token, input, syntax); 3712 c = token->opr.c; 3713 if (BE (token->type == END_OF_RE, 0)) 3714 return REG_ERROR; 3715 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3716 break; 3717 num = ((token->type != CHARACTER || c < '0' || '9' < c 3718 || num == REG_ERROR) 3719 ? REG_ERROR 3720 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); 3721 num = (num > RE_DUP_MAX) ? REG_ERROR : num; 3722 } 3723 return num; 3724} 3725 3726#ifdef RE_ENABLE_I18N 3727static void 3728free_charset (re_charset_t *cset) 3729{ 3730 re_free (cset->mbchars); 3731# ifdef _LIBC 3732 re_free (cset->coll_syms); 3733 re_free (cset->equiv_classes); 3734 re_free (cset->range_starts); 3735 re_free (cset->range_ends); 3736# endif 3737 re_free (cset->char_classes); 3738 re_free (cset); 3739} 3740#endif /* RE_ENABLE_I18N */ 3741 3742/* Functions for binary tree operation. */ 3743 3744/* Create a tree node. */ 3745 3746static bin_tree_t * 3747create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3748 re_token_type_t type) 3749{ 3750 re_token_t t; 3751 t.type = type; 3752 return create_token_tree (dfa, left, right, &t); 3753} 3754 3755static bin_tree_t * 3756create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3757 const re_token_t *token) 3758{ 3759 bin_tree_t *tree; 3760 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3761 { 3762 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3763 3764 if (storage == NULL) 3765 return NULL; 3766 storage->next = dfa->str_tree_storage; 3767 dfa->str_tree_storage = storage; 3768 dfa->str_tree_storage_idx = 0; 3769 } 3770 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3771 3772 tree->parent = NULL; 3773 tree->left = left; 3774 tree->right = right; 3775 tree->token = *token; 3776 tree->token.duplicated = 0; 3777 tree->token.opt_subexp = 0; 3778 tree->first = NULL; 3779 tree->next = NULL; 3780 tree->node_idx = REG_MISSING; 3781 3782 if (left != NULL) 3783 left->parent = tree; 3784 if (right != NULL) 3785 right->parent = tree; 3786 return tree; 3787} 3788 3789/* Mark the tree SRC as an optional subexpression. 3790 To be called from preorder or postorder. */ 3791 3792static reg_errcode_t 3793mark_opt_subexp (void *extra, bin_tree_t *node) 3794{ 3795 Idx idx = (Idx) (long) extra; 3796 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3797 node->token.opt_subexp = 1; 3798 3799 return REG_NOERROR; 3800} 3801 3802/* Free the allocated memory inside NODE. */ 3803 3804static void 3805free_token (re_token_t *node) 3806{ 3807#ifdef RE_ENABLE_I18N 3808 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3809 free_charset (node->opr.mbcset); 3810 else 3811#endif /* RE_ENABLE_I18N */ 3812 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3813 re_free (node->opr.sbcset); 3814} 3815 3816/* Worker function for tree walking. Free the allocated memory inside NODE 3817 and its children. */ 3818 3819static reg_errcode_t 3820free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node) 3821{ 3822 free_token (&node->token); 3823 return REG_NOERROR; 3824} 3825 3826 3827/* Duplicate the node SRC, and return new node. This is a preorder 3828 visit similar to the one implemented by the generic visitor, but 3829 we need more infrastructure to maintain two parallel trees --- so, 3830 it's easier to duplicate. */ 3831 3832static bin_tree_t * 3833duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3834{ 3835 const bin_tree_t *node; 3836 bin_tree_t *dup_root; 3837 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3838 3839 for (node = root; ; ) 3840 { 3841 /* Create a new tree and link it back to the current parent. */ 3842 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3843 if (*p_new == NULL) 3844 return NULL; 3845 (*p_new)->parent = dup_node; 3846 (*p_new)->token.duplicated = 1; 3847 dup_node = *p_new; 3848 3849 /* Go to the left node, or up and to the right. */ 3850 if (node->left) 3851 { 3852 node = node->left; 3853 p_new = &dup_node->left; 3854 } 3855 else 3856 { 3857 const bin_tree_t *prev = NULL; 3858 while (node->right == prev || node->right == NULL) 3859 { 3860 prev = node; 3861 node = node->parent; 3862 dup_node = dup_node->parent; 3863 if (!node) 3864 return dup_root; 3865 } 3866 node = node->right; 3867 p_new = &dup_node->right; 3868 } 3869 } 3870} 3871