1/* Extended regular expression matching and search library. 2 Copyright (C) 2002-2014 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU General Public 8 License as published by the Free Software Foundation; either 9 version 3 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public 17 License along with the GNU C Library; if not, see 18 <http://www.gnu.org/licenses/>. */ 19 20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 21 size_t length, reg_syntax_t syntax); 22static void re_compile_fastmap_iter (regex_t *bufp, 23 const re_dfastate_t *init_state, 24 char *fastmap); 25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 26#ifdef RE_ENABLE_I18N 27static void free_charset (re_charset_t *cset); 28#endif /* RE_ENABLE_I18N */ 29static void free_workarea_compile (regex_t *preg); 30static reg_errcode_t create_initial_state (re_dfa_t *dfa); 31#ifdef RE_ENABLE_I18N 32static void optimize_utf8 (re_dfa_t *dfa); 33#endif 34static reg_errcode_t analyze (regex_t *preg); 35static reg_errcode_t preorder (bin_tree_t *root, 36 reg_errcode_t (fn (void *, bin_tree_t *)), 37 void *extra); 38static reg_errcode_t postorder (bin_tree_t *root, 39 reg_errcode_t (fn (void *, bin_tree_t *)), 40 void *extra); 41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 44 bin_tree_t *node); 45static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 46static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 48static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 49static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 50 unsigned int constraint); 51static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 53 Idx node, bool root); 54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 55static Idx fetch_number (re_string_t *input, re_token_t *token, 56 reg_syntax_t syntax); 57static int peek_token (re_token_t *token, re_string_t *input, 58 reg_syntax_t syntax) internal_function; 59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 60 reg_syntax_t syntax, reg_errcode_t *err); 61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 62 re_token_t *token, reg_syntax_t syntax, 63 Idx nest, reg_errcode_t *err); 64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 65 re_token_t *token, reg_syntax_t syntax, 66 Idx nest, reg_errcode_t *err); 67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 68 re_token_t *token, reg_syntax_t syntax, 69 Idx nest, reg_errcode_t *err); 70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 71 re_token_t *token, reg_syntax_t syntax, 72 Idx nest, reg_errcode_t *err); 73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 74 re_dfa_t *dfa, re_token_t *token, 75 reg_syntax_t syntax, reg_errcode_t *err); 76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 77 re_token_t *token, reg_syntax_t syntax, 78 reg_errcode_t *err); 79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 80 re_string_t *regexp, 81 re_token_t *token, int token_len, 82 re_dfa_t *dfa, 83 reg_syntax_t syntax, 84 bool accept_hyphen); 85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 86 re_string_t *regexp, 87 re_token_t *token); 88#ifdef RE_ENABLE_I18N 89static reg_errcode_t build_equiv_class (bitset_t sbcset, 90 re_charset_t *mbcset, 91 Idx *equiv_class_alloc, 92 const unsigned char *name); 93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 94 bitset_t sbcset, 95 re_charset_t *mbcset, 96 Idx *char_class_alloc, 97 const char *class_name, 98 reg_syntax_t syntax); 99#else /* not RE_ENABLE_I18N */ 100static reg_errcode_t build_equiv_class (bitset_t sbcset, 101 const unsigned char *name); 102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 103 bitset_t sbcset, 104 const char *class_name, 105 reg_syntax_t syntax); 106#endif /* not RE_ENABLE_I18N */ 107static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 108 RE_TRANSLATE_TYPE trans, 109 const char *class_name, 110 const char *extra, 111 bool non_match, reg_errcode_t *err); 112static bin_tree_t *create_tree (re_dfa_t *dfa, 113 bin_tree_t *left, bin_tree_t *right, 114 re_token_type_t type); 115static bin_tree_t *create_token_tree (re_dfa_t *dfa, 116 bin_tree_t *left, bin_tree_t *right, 117 const re_token_t *token); 118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 119static void free_token (re_token_t *node); 120static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 122 123/* This table gives an error message for each of the error codes listed 124 in regex.h. Obviously the order here has to be same as there. 125 POSIX doesn't require that we do anything for REG_NOERROR, 126 but why not be nice? */ 127 128static const char __re_error_msgid[] = 129 { 130#define REG_NOERROR_IDX 0 131 gettext_noop ("Success") /* REG_NOERROR */ 132 "\0" 133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 134 gettext_noop ("No match") /* REG_NOMATCH */ 135 "\0" 136#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 138 "\0" 139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 141 "\0" 142#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 144 "\0" 145#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 147 "\0" 148#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 150 "\0" 151#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 153 "\0" 154#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 156 "\0" 157#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 159 "\0" 160#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 162 "\0" 163#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 164 gettext_noop ("Invalid range end") /* REG_ERANGE */ 165 "\0" 166#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 167 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 168 "\0" 169#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 171 "\0" 172#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 173 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 174 "\0" 175#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 176 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 177 "\0" 178#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 180 }; 181 182static const size_t __re_error_msgid_idx[] = 183 { 184 REG_NOERROR_IDX, 185 REG_NOMATCH_IDX, 186 REG_BADPAT_IDX, 187 REG_ECOLLATE_IDX, 188 REG_ECTYPE_IDX, 189 REG_EESCAPE_IDX, 190 REG_ESUBREG_IDX, 191 REG_EBRACK_IDX, 192 REG_EPAREN_IDX, 193 REG_EBRACE_IDX, 194 REG_BADBR_IDX, 195 REG_ERANGE_IDX, 196 REG_ESPACE_IDX, 197 REG_BADRPT_IDX, 198 REG_EEND_IDX, 199 REG_ESIZE_IDX, 200 REG_ERPAREN_IDX 201 }; 202 203/* Entry points for GNU code. */ 204 205/* re_compile_pattern is the GNU regular expression compiler: it 206 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 207 Returns 0 if the pattern was valid, otherwise an error string. 208 209 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields 210 are set in BUFP on entry. */ 211 212#ifdef _LIBC 213const char * 214re_compile_pattern (pattern, length, bufp) 215 const char *pattern; 216 size_t length; 217 struct re_pattern_buffer *bufp; 218#else /* size_t might promote */ 219const char * 220re_compile_pattern (const char *pattern, size_t length, 221 struct re_pattern_buffer *bufp) 222#endif 223{ 224 reg_errcode_t ret; 225 226 /* And GNU code determines whether or not to get register information 227 by passing null for the REGS argument to re_match, etc., not by 228 setting no_sub, unless RE_NO_SUB is set. */ 229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 230 231 /* Match anchors at newline. */ 232 bufp->newline_anchor = 1; 233 234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 235 236 if (!ret) 237 return NULL; 238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 239} 240#ifdef _LIBC 241weak_alias (__re_compile_pattern, re_compile_pattern) 242#endif 243 244/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can 245 also be assigned to arbitrarily: each pattern buffer stores its own 246 syntax, so it can be changed between regex compilations. */ 247/* This has no initializer because initialized variables in Emacs 248 become read-only after dumping. */ 249reg_syntax_t re_syntax_options; 250 251 252/* Specify the precise syntax of regexps for compilation. This provides 253 for compatibility for various utilities which historically have 254 different, incompatible syntaxes. 255 256 The argument SYNTAX is a bit mask comprised of the various bits 257 defined in regex.h. We return the old syntax. */ 258 259reg_syntax_t 260re_set_syntax (syntax) 261 reg_syntax_t syntax; 262{ 263 reg_syntax_t ret = re_syntax_options; 264 265 re_syntax_options = syntax; 266 return ret; 267} 268#ifdef _LIBC 269weak_alias (__re_set_syntax, re_set_syntax) 270#endif 271 272int 273re_compile_fastmap (bufp) 274 struct re_pattern_buffer *bufp; 275{ 276 re_dfa_t *dfa = bufp->buffer; 277 char *fastmap = bufp->fastmap; 278 279 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 281 if (dfa->init_state != dfa->init_state_word) 282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 283 if (dfa->init_state != dfa->init_state_nl) 284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 285 if (dfa->init_state != dfa->init_state_begbuf) 286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 287 bufp->fastmap_accurate = 1; 288 return 0; 289} 290#ifdef _LIBC 291weak_alias (__re_compile_fastmap, re_compile_fastmap) 292#endif 293 294static inline void 295__attribute__ ((always_inline)) 296re_set_fastmap (char *fastmap, bool icase, int ch) 297{ 298 fastmap[ch] = 1; 299 if (icase) 300 fastmap[tolower (ch)] = 1; 301} 302 303/* Helper function for re_compile_fastmap. 304 Compile fastmap for the initial_state INIT_STATE. */ 305 306static void 307re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 308 char *fastmap) 309{ 310 re_dfa_t *dfa = bufp->buffer; 311 Idx node_cnt; 312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 314 { 315 Idx node = init_state->nodes.elems[node_cnt]; 316 re_token_type_t type = dfa->nodes[node].type; 317 318 if (type == CHARACTER) 319 { 320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 321#ifdef RE_ENABLE_I18N 322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 323 { 324 unsigned char buf[MB_LEN_MAX]; 325 unsigned char *p; 326 wchar_t wc; 327 mbstate_t state; 328 329 p = buf; 330 *p++ = dfa->nodes[node].opr.c; 331 while (++node < dfa->nodes_len 332 && dfa->nodes[node].type == CHARACTER 333 && dfa->nodes[node].mb_partial) 334 *p++ = dfa->nodes[node].opr.c; 335 memset (&state, '\0', sizeof (state)); 336 if (__mbrtowc (&wc, (const char *) buf, p - buf, 337 &state) == p - buf 338 && (__wcrtomb ((char *) buf, towlower (wc), &state) 339 != (size_t) -1)) 340 re_set_fastmap (fastmap, false, buf[0]); 341 } 342#endif 343 } 344 else if (type == SIMPLE_BRACKET) 345 { 346 int i, ch; 347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 348 { 349 int j; 350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 352 if (w & ((bitset_word_t) 1 << j)) 353 re_set_fastmap (fastmap, icase, ch); 354 } 355 } 356#ifdef RE_ENABLE_I18N 357 else if (type == COMPLEX_BRACKET) 358 { 359 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 360 Idx i; 361 362# ifdef _LIBC 363 /* See if we have to try all bytes which start multiple collation 364 elements. 365 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 366 collation element, and don't catch 'b' since 'b' is 367 the only collation element which starts from 'b' (and 368 it is caught by SIMPLE_BRACKET). */ 369 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 370 && (cset->ncoll_syms || cset->nranges)) 371 { 372 const int32_t *table = (const int32_t *) 373 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 374 for (i = 0; i < SBC_MAX; ++i) 375 if (table[i] < 0) 376 re_set_fastmap (fastmap, icase, i); 377 } 378# endif /* _LIBC */ 379 380 /* See if we have to start the match at all multibyte characters, 381 i.e. where we would not find an invalid sequence. This only 382 applies to multibyte character sets; for single byte character 383 sets, the SIMPLE_BRACKET again suffices. */ 384 if (dfa->mb_cur_max > 1 385 && (cset->nchar_classes || cset->non_match || cset->nranges 386# ifdef _LIBC 387 || cset->nequiv_classes 388# endif /* _LIBC */ 389 )) 390 { 391 unsigned char c = 0; 392 do 393 { 394 mbstate_t mbs; 395 memset (&mbs, 0, sizeof (mbs)); 396 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 397 re_set_fastmap (fastmap, false, (int) c); 398 } 399 while (++c != 0); 400 } 401 402 else 403 { 404 /* ... Else catch all bytes which can start the mbchars. */ 405 for (i = 0; i < cset->nmbchars; ++i) 406 { 407 char buf[256]; 408 mbstate_t state; 409 memset (&state, '\0', sizeof (state)); 410 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 412 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 413 { 414 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 415 != (size_t) -1) 416 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 417 } 418 } 419 } 420 } 421#endif /* RE_ENABLE_I18N */ 422 else if (type == OP_PERIOD 423#ifdef RE_ENABLE_I18N 424 || type == OP_UTF8_PERIOD 425#endif /* RE_ENABLE_I18N */ 426 || type == END_OF_RE) 427 { 428 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 429 if (type == END_OF_RE) 430 bufp->can_be_null = 1; 431 return; 432 } 433 } 434} 435 436/* Entry point for POSIX code. */ 437/* regcomp takes a regular expression as a string and compiles it. 438 439 PREG is a regex_t *. We do not expect any fields to be initialized, 440 since POSIX says we shouldn't. Thus, we set 441 442 'buffer' to the compiled pattern; 443 'used' to the length of the compiled pattern; 444 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the 445 REG_EXTENDED bit in CFLAGS is set; otherwise, to 446 RE_SYNTAX_POSIX_BASIC; 447 'newline_anchor' to REG_NEWLINE being set in CFLAGS; 448 'fastmap' to an allocated space for the fastmap; 449 'fastmap_accurate' to zero; 450 're_nsub' to the number of subexpressions in PATTERN. 451 452 PATTERN is the address of the pattern string. 453 454 CFLAGS is a series of bits which affect compilation. 455 456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 457 use POSIX basic syntax. 458 459 If REG_NEWLINE is set, then . and [^...] don't match newline. 460 Also, regexec will try a match beginning after every newline. 461 462 If REG_ICASE is set, then we considers upper- and lowercase 463 versions of letters to be equivalent when matching. 464 465 If REG_NOSUB is set, then when PREG is passed to regexec, that 466 routine will report only success or failure, and nothing about the 467 registers. 468 469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 470 the return codes and their meanings.) */ 471 472int 473regcomp (preg, pattern, cflags) 474 regex_t *_Restrict_ preg; 475 const char *_Restrict_ pattern; 476 int cflags; 477{ 478 reg_errcode_t ret; 479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 480 : RE_SYNTAX_POSIX_BASIC); 481 482 preg->buffer = NULL; 483 preg->allocated = 0; 484 preg->used = 0; 485 486 /* Try to allocate space for the fastmap. */ 487 preg->fastmap = re_malloc (char, SBC_MAX); 488 if (BE (preg->fastmap == NULL, 0)) 489 return REG_ESPACE; 490 491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 492 493 /* If REG_NEWLINE is set, newlines are treated differently. */ 494 if (cflags & REG_NEWLINE) 495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 496 syntax &= ~RE_DOT_NEWLINE; 497 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 498 /* It also changes the matching behavior. */ 499 preg->newline_anchor = 1; 500 } 501 else 502 preg->newline_anchor = 0; 503 preg->no_sub = !!(cflags & REG_NOSUB); 504 preg->translate = NULL; 505 506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 507 508 /* POSIX doesn't distinguish between an unmatched open-group and an 509 unmatched close-group: both are REG_EPAREN. */ 510 if (ret == REG_ERPAREN) 511 ret = REG_EPAREN; 512 513 /* We have already checked preg->fastmap != NULL. */ 514 if (BE (ret == REG_NOERROR, 1)) 515 /* Compute the fastmap now, since regexec cannot modify the pattern 516 buffer. This function never fails in this implementation. */ 517 (void) re_compile_fastmap (preg); 518 else 519 { 520 /* Some error occurred while compiling the expression. */ 521 re_free (preg->fastmap); 522 preg->fastmap = NULL; 523 } 524 525 return (int) ret; 526} 527#ifdef _LIBC 528weak_alias (__regcomp, regcomp) 529#endif 530 531/* Returns a message corresponding to an error code, ERRCODE, returned 532 from either regcomp or regexec. We don't use PREG here. */ 533 534#ifdef _LIBC 535size_t 536regerror (errcode, preg, errbuf, errbuf_size) 537 int errcode; 538 const regex_t *_Restrict_ preg; 539 char *_Restrict_ errbuf; 540 size_t errbuf_size; 541#else /* size_t might promote */ 542size_t 543regerror (int errcode, const regex_t *_Restrict_ preg, 544 char *_Restrict_ errbuf, size_t errbuf_size) 545#endif 546{ 547 const char *msg; 548 size_t msg_size; 549 550 if (BE (errcode < 0 551 || errcode >= (int) (sizeof (__re_error_msgid_idx) 552 / sizeof (__re_error_msgid_idx[0])), 0)) 553 /* Only error codes returned by the rest of the code should be passed 554 to this routine. If we are given anything else, or if other regex 555 code generates an invalid error code, then the program has a bug. 556 Dump core so we can fix it. */ 557 abort (); 558 559 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 560 561 msg_size = strlen (msg) + 1; /* Includes the null. */ 562 563 if (BE (errbuf_size != 0, 1)) 564 { 565 size_t cpy_size = msg_size; 566 if (BE (msg_size > errbuf_size, 0)) 567 { 568 cpy_size = errbuf_size - 1; 569 errbuf[cpy_size] = '\0'; 570 } 571 memcpy (errbuf, msg, cpy_size); 572 } 573 574 return msg_size; 575} 576#ifdef _LIBC 577weak_alias (__regerror, regerror) 578#endif 579 580 581#ifdef RE_ENABLE_I18N 582/* This static array is used for the map to single-byte characters when 583 UTF-8 is used. Otherwise we would allocate memory just to initialize 584 it the same all the time. UTF-8 is the preferred encoding so this is 585 a worthwhile optimization. */ 586static const bitset_t utf8_sb_map = 587{ 588 /* Set the first 128 bits. */ 589# if defined __GNUC__ && !defined __STRICT_ANSI__ 590 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX 591# else 592# if 4 * BITSET_WORD_BITS < ASCII_CHARS 593# error "bitset_word_t is narrower than 32 bits" 594# elif 3 * BITSET_WORD_BITS < ASCII_CHARS 595 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 596# elif 2 * BITSET_WORD_BITS < ASCII_CHARS 597 BITSET_WORD_MAX, BITSET_WORD_MAX, 598# elif 1 * BITSET_WORD_BITS < ASCII_CHARS 599 BITSET_WORD_MAX, 600# endif 601 (BITSET_WORD_MAX 602 >> (SBC_MAX % BITSET_WORD_BITS == 0 603 ? 0 604 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 605# endif 606}; 607#endif 608 609 610static void 611free_dfa_content (re_dfa_t *dfa) 612{ 613 Idx i, j; 614 615 if (dfa->nodes) 616 for (i = 0; i < dfa->nodes_len; ++i) 617 free_token (dfa->nodes + i); 618 re_free (dfa->nexts); 619 for (i = 0; i < dfa->nodes_len; ++i) 620 { 621 if (dfa->eclosures != NULL) 622 re_node_set_free (dfa->eclosures + i); 623 if (dfa->inveclosures != NULL) 624 re_node_set_free (dfa->inveclosures + i); 625 if (dfa->edests != NULL) 626 re_node_set_free (dfa->edests + i); 627 } 628 re_free (dfa->edests); 629 re_free (dfa->eclosures); 630 re_free (dfa->inveclosures); 631 re_free (dfa->nodes); 632 633 if (dfa->state_table) 634 for (i = 0; i <= dfa->state_hash_mask; ++i) 635 { 636 struct re_state_table_entry *entry = dfa->state_table + i; 637 for (j = 0; j < entry->num; ++j) 638 { 639 re_dfastate_t *state = entry->array[j]; 640 free_state (state); 641 } 642 re_free (entry->array); 643 } 644 re_free (dfa->state_table); 645#ifdef RE_ENABLE_I18N 646 if (dfa->sb_char != utf8_sb_map) 647 re_free (dfa->sb_char); 648#endif 649 re_free (dfa->subexp_map); 650#ifdef DEBUG 651 re_free (dfa->re_str); 652#endif 653 654 re_free (dfa); 655} 656 657 658/* Free dynamically allocated space used by PREG. */ 659 660void 661regfree (preg) 662 regex_t *preg; 663{ 664 re_dfa_t *dfa = preg->buffer; 665 if (BE (dfa != NULL, 1)) 666 { 667 lock_fini (dfa->lock); 668 free_dfa_content (dfa); 669 } 670 preg->buffer = NULL; 671 preg->allocated = 0; 672 673 re_free (preg->fastmap); 674 preg->fastmap = NULL; 675 676 re_free (preg->translate); 677 preg->translate = NULL; 678} 679#ifdef _LIBC 680weak_alias (__regfree, regfree) 681#endif 682 683/* Entry points compatible with 4.2 BSD regex library. We don't define 684 them unless specifically requested. */ 685 686#if defined _REGEX_RE_COMP || defined _LIBC 687 688/* BSD has one and only one pattern buffer. */ 689static struct re_pattern_buffer re_comp_buf; 690 691char * 692# ifdef _LIBC 693/* Make these definitions weak in libc, so POSIX programs can redefine 694 these names if they don't use our functions, and still use 695 regcomp/regexec above without link errors. */ 696weak_function 697# endif 698re_comp (s) 699 const char *s; 700{ 701 reg_errcode_t ret; 702 char *fastmap; 703 704 if (!s) 705 { 706 if (!re_comp_buf.buffer) 707 return gettext ("No previous regular expression"); 708 return 0; 709 } 710 711 if (re_comp_buf.buffer) 712 { 713 fastmap = re_comp_buf.fastmap; 714 re_comp_buf.fastmap = NULL; 715 __regfree (&re_comp_buf); 716 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 717 re_comp_buf.fastmap = fastmap; 718 } 719 720 if (re_comp_buf.fastmap == NULL) 721 { 722 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 723 if (re_comp_buf.fastmap == NULL) 724 return (char *) gettext (__re_error_msgid 725 + __re_error_msgid_idx[(int) REG_ESPACE]); 726 } 727 728 /* Since 're_exec' always passes NULL for the 'regs' argument, we 729 don't need to initialize the pattern buffer fields which affect it. */ 730 731 /* Match anchors at newlines. */ 732 re_comp_buf.newline_anchor = 1; 733 734 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 735 736 if (!ret) 737 return NULL; 738 739 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */ 740 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 741} 742 743#ifdef _LIBC 744libc_freeres_fn (free_mem) 745{ 746 __regfree (&re_comp_buf); 747} 748#endif 749 750#endif /* _REGEX_RE_COMP */ 751 752/* Internal entry point. 753 Compile the regular expression PATTERN, whose length is LENGTH. 754 SYNTAX indicate regular expression's syntax. */ 755 756static reg_errcode_t 757re_compile_internal (regex_t *preg, const char * pattern, size_t length, 758 reg_syntax_t syntax) 759{ 760 reg_errcode_t err = REG_NOERROR; 761 re_dfa_t *dfa; 762 re_string_t regexp; 763 764 /* Initialize the pattern buffer. */ 765 preg->fastmap_accurate = 0; 766 preg->syntax = syntax; 767 preg->not_bol = preg->not_eol = 0; 768 preg->used = 0; 769 preg->re_nsub = 0; 770 preg->can_be_null = 0; 771 preg->regs_allocated = REGS_UNALLOCATED; 772 773 /* Initialize the dfa. */ 774 dfa = preg->buffer; 775 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 776 { 777 /* If zero allocated, but buffer is non-null, try to realloc 778 enough space. This loses if buffer's address is bogus, but 779 that is the user's responsibility. If ->buffer is NULL this 780 is a simple allocation. */ 781 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 782 if (dfa == NULL) 783 return REG_ESPACE; 784 preg->allocated = sizeof (re_dfa_t); 785 preg->buffer = dfa; 786 } 787 preg->used = sizeof (re_dfa_t); 788 789 err = init_dfa (dfa, length); 790 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0)) 791 err = REG_ESPACE; 792 if (BE (err != REG_NOERROR, 0)) 793 { 794 free_dfa_content (dfa); 795 preg->buffer = NULL; 796 preg->allocated = 0; 797 return err; 798 } 799#ifdef DEBUG 800 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 801 dfa->re_str = re_malloc (char, length + 1); 802 strncpy (dfa->re_str, pattern, length + 1); 803#endif 804 805 err = re_string_construct (®exp, pattern, length, preg->translate, 806 (syntax & RE_ICASE) != 0, dfa); 807 if (BE (err != REG_NOERROR, 0)) 808 { 809 re_compile_internal_free_return: 810 free_workarea_compile (preg); 811 re_string_destruct (®exp); 812 lock_fini (dfa->lock); 813 free_dfa_content (dfa); 814 preg->buffer = NULL; 815 preg->allocated = 0; 816 return err; 817 } 818 819 /* Parse the regular expression, and build a structure tree. */ 820 preg->re_nsub = 0; 821 dfa->str_tree = parse (®exp, preg, syntax, &err); 822 if (BE (dfa->str_tree == NULL, 0)) 823 goto re_compile_internal_free_return; 824 825 /* Analyze the tree and create the nfa. */ 826 err = analyze (preg); 827 if (BE (err != REG_NOERROR, 0)) 828 goto re_compile_internal_free_return; 829 830#ifdef RE_ENABLE_I18N 831 /* If possible, do searching in single byte encoding to speed things up. */ 832 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 833 optimize_utf8 (dfa); 834#endif 835 836 /* Then create the initial state of the dfa. */ 837 err = create_initial_state (dfa); 838 839 /* Release work areas. */ 840 free_workarea_compile (preg); 841 re_string_destruct (®exp); 842 843 if (BE (err != REG_NOERROR, 0)) 844 { 845 lock_fini (dfa->lock); 846 free_dfa_content (dfa); 847 preg->buffer = NULL; 848 preg->allocated = 0; 849 } 850 851 return err; 852} 853 854/* Initialize DFA. We use the length of the regular expression PAT_LEN 855 as the initial length of some arrays. */ 856 857static reg_errcode_t 858init_dfa (re_dfa_t *dfa, size_t pat_len) 859{ 860 __re_size_t table_size; 861#ifndef _LIBC 862 const char *codeset_name; 863#endif 864#ifdef RE_ENABLE_I18N 865 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); 866#else 867 size_t max_i18n_object_size = 0; 868#endif 869 size_t max_object_size = 870 MAX (sizeof (struct re_state_table_entry), 871 MAX (sizeof (re_token_t), 872 MAX (sizeof (re_node_set), 873 MAX (sizeof (regmatch_t), 874 max_i18n_object_size)))); 875 876 memset (dfa, '\0', sizeof (re_dfa_t)); 877 878 /* Force allocation of str_tree_storage the first time. */ 879 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 880 881 /* Avoid overflows. The extra "/ 2" is for the table_size doubling 882 calculation below, and for similar doubling calculations 883 elsewhere. And it's <= rather than <, because some of the 884 doubling calculations add 1 afterwards. */ 885 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0)) 886 return REG_ESPACE; 887 888 dfa->nodes_alloc = pat_len + 1; 889 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 890 891 /* table_size = 2 ^ ceil(log pat_len) */ 892 for (table_size = 1; ; table_size <<= 1) 893 if (table_size > pat_len) 894 break; 895 896 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 897 dfa->state_hash_mask = table_size - 1; 898 899 dfa->mb_cur_max = MB_CUR_MAX; 900#ifdef _LIBC 901 if (dfa->mb_cur_max == 6 902 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 903 dfa->is_utf8 = 1; 904 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 905 != 0); 906#else 907 codeset_name = nl_langinfo (CODESET); 908 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u') 909 && (codeset_name[1] == 'T' || codeset_name[1] == 't') 910 && (codeset_name[2] == 'F' || codeset_name[2] == 'f') 911 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0) 912 dfa->is_utf8 = 1; 913 914 /* We check exhaustively in the loop below if this charset is a 915 superset of ASCII. */ 916 dfa->map_notascii = 0; 917#endif 918 919#ifdef RE_ENABLE_I18N 920 if (dfa->mb_cur_max > 1) 921 { 922 if (dfa->is_utf8) 923 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 924 else 925 { 926 int i, j, ch; 927 928 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 929 if (BE (dfa->sb_char == NULL, 0)) 930 return REG_ESPACE; 931 932 /* Set the bits corresponding to single byte chars. */ 933 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 934 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 935 { 936 wint_t wch = __btowc (ch); 937 if (wch != WEOF) 938 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 939# ifndef _LIBC 940 if (isascii (ch) && wch != ch) 941 dfa->map_notascii = 1; 942# endif 943 } 944 } 945 } 946#endif 947 948 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 949 return REG_ESPACE; 950 return REG_NOERROR; 951} 952 953/* Initialize WORD_CHAR table, which indicate which character is 954 "word". In this case "word" means that it is the word construction 955 character used by some operators like "\<", "\>", etc. */ 956 957static void 958internal_function 959init_word_char (re_dfa_t *dfa) 960{ 961 int i = 0; 962 int j; 963 int ch = 0; 964 dfa->word_ops_used = 1; 965 if (BE (dfa->map_notascii == 0, 1)) 966 { 967 bitset_word_t bits0 = 0x00000000; 968 bitset_word_t bits1 = 0x03ff0000; 969 bitset_word_t bits2 = 0x87fffffe; 970 bitset_word_t bits3 = 0x07fffffe; 971 if (BITSET_WORD_BITS == 64) 972 { 973 dfa->word_char[0] = bits1 << 31 << 1 | bits0; 974 dfa->word_char[1] = bits3 << 31 << 1 | bits2; 975 i = 2; 976 } 977 else if (BITSET_WORD_BITS == 32) 978 { 979 dfa->word_char[0] = bits0; 980 dfa->word_char[1] = bits1; 981 dfa->word_char[2] = bits2; 982 dfa->word_char[3] = bits3; 983 i = 4; 984 } 985 else 986 goto general_case; 987 ch = 128; 988 989 if (BE (dfa->is_utf8, 1)) 990 { 991 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); 992 return; 993 } 994 } 995 996 general_case: 997 for (; i < BITSET_WORDS; ++i) 998 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 999 if (isalnum (ch) || ch == '_') 1000 dfa->word_char[i] |= (bitset_word_t) 1 << j; 1001} 1002 1003/* Free the work area which are only used while compiling. */ 1004 1005static void 1006free_workarea_compile (regex_t *preg) 1007{ 1008 re_dfa_t *dfa = preg->buffer; 1009 bin_tree_storage_t *storage, *next; 1010 for (storage = dfa->str_tree_storage; storage; storage = next) 1011 { 1012 next = storage->next; 1013 re_free (storage); 1014 } 1015 dfa->str_tree_storage = NULL; 1016 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 1017 dfa->str_tree = NULL; 1018 re_free (dfa->org_indices); 1019 dfa->org_indices = NULL; 1020} 1021 1022/* Create initial states for all contexts. */ 1023 1024static reg_errcode_t 1025create_initial_state (re_dfa_t *dfa) 1026{ 1027 Idx first, i; 1028 reg_errcode_t err; 1029 re_node_set init_nodes; 1030 1031 /* Initial states have the epsilon closure of the node which is 1032 the first node of the regular expression. */ 1033 first = dfa->str_tree->first->node_idx; 1034 dfa->init_node = first; 1035 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 1036 if (BE (err != REG_NOERROR, 0)) 1037 return err; 1038 1039 /* The back-references which are in initial states can epsilon transit, 1040 since in this case all of the subexpressions can be null. 1041 Then we add epsilon closures of the nodes which are the next nodes of 1042 the back-references. */ 1043 if (dfa->nbackref > 0) 1044 for (i = 0; i < init_nodes.nelem; ++i) 1045 { 1046 Idx node_idx = init_nodes.elems[i]; 1047 re_token_type_t type = dfa->nodes[node_idx].type; 1048 1049 Idx clexp_idx; 1050 if (type != OP_BACK_REF) 1051 continue; 1052 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1053 { 1054 re_token_t *clexp_node; 1055 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1056 if (clexp_node->type == OP_CLOSE_SUBEXP 1057 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1058 break; 1059 } 1060 if (clexp_idx == init_nodes.nelem) 1061 continue; 1062 1063 if (type == OP_BACK_REF) 1064 { 1065 Idx dest_idx = dfa->edests[node_idx].elems[0]; 1066 if (!re_node_set_contains (&init_nodes, dest_idx)) 1067 { 1068 reg_errcode_t merge_err 1069 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1070 if (merge_err != REG_NOERROR) 1071 return merge_err; 1072 i = 0; 1073 } 1074 } 1075 } 1076 1077 /* It must be the first time to invoke acquire_state. */ 1078 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1079 /* We don't check ERR here, since the initial state must not be NULL. */ 1080 if (BE (dfa->init_state == NULL, 0)) 1081 return err; 1082 if (dfa->init_state->has_constraint) 1083 { 1084 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1085 CONTEXT_WORD); 1086 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1087 CONTEXT_NEWLINE); 1088 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1089 &init_nodes, 1090 CONTEXT_NEWLINE 1091 | CONTEXT_BEGBUF); 1092 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1093 || dfa->init_state_begbuf == NULL, 0)) 1094 return err; 1095 } 1096 else 1097 dfa->init_state_word = dfa->init_state_nl 1098 = dfa->init_state_begbuf = dfa->init_state; 1099 1100 re_node_set_free (&init_nodes); 1101 return REG_NOERROR; 1102} 1103 1104#ifdef RE_ENABLE_I18N 1105/* If it is possible to do searching in single byte encoding instead of UTF-8 1106 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1107 DFA nodes where needed. */ 1108 1109static void 1110optimize_utf8 (re_dfa_t *dfa) 1111{ 1112 Idx node; 1113 int i; 1114 bool mb_chars = false; 1115 bool has_period = false; 1116 1117 for (node = 0; node < dfa->nodes_len; ++node) 1118 switch (dfa->nodes[node].type) 1119 { 1120 case CHARACTER: 1121 if (dfa->nodes[node].opr.c >= ASCII_CHARS) 1122 mb_chars = true; 1123 break; 1124 case ANCHOR: 1125 switch (dfa->nodes[node].opr.ctx_type) 1126 { 1127 case LINE_FIRST: 1128 case LINE_LAST: 1129 case BUF_FIRST: 1130 case BUF_LAST: 1131 break; 1132 default: 1133 /* Word anchors etc. cannot be handled. It's okay to test 1134 opr.ctx_type since constraints (for all DFA nodes) are 1135 created by ORing one or more opr.ctx_type values. */ 1136 return; 1137 } 1138 break; 1139 case OP_PERIOD: 1140 has_period = true; 1141 break; 1142 case OP_BACK_REF: 1143 case OP_ALT: 1144 case END_OF_RE: 1145 case OP_DUP_ASTERISK: 1146 case OP_OPEN_SUBEXP: 1147 case OP_CLOSE_SUBEXP: 1148 break; 1149 case COMPLEX_BRACKET: 1150 return; 1151 case SIMPLE_BRACKET: 1152 /* Just double check. */ 1153 { 1154 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 1155 ? 0 1156 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); 1157 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1158 { 1159 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1160 return; 1161 rshift = 0; 1162 } 1163 } 1164 break; 1165 default: 1166 abort (); 1167 } 1168 1169 if (mb_chars || has_period) 1170 for (node = 0; node < dfa->nodes_len; ++node) 1171 { 1172 if (dfa->nodes[node].type == CHARACTER 1173 && dfa->nodes[node].opr.c >= ASCII_CHARS) 1174 dfa->nodes[node].mb_partial = 0; 1175 else if (dfa->nodes[node].type == OP_PERIOD) 1176 dfa->nodes[node].type = OP_UTF8_PERIOD; 1177 } 1178 1179 /* The search can be in single byte locale. */ 1180 dfa->mb_cur_max = 1; 1181 dfa->is_utf8 = 0; 1182 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1183} 1184#endif 1185 1186/* Analyze the structure tree, and calculate "first", "next", "edest", 1187 "eclosure", and "inveclosure". */ 1188 1189static reg_errcode_t 1190analyze (regex_t *preg) 1191{ 1192 re_dfa_t *dfa = preg->buffer; 1193 reg_errcode_t ret; 1194 1195 /* Allocate arrays. */ 1196 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1197 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1198 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1199 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1200 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1201 || dfa->eclosures == NULL, 0)) 1202 return REG_ESPACE; 1203 1204 dfa->subexp_map = re_malloc (Idx, preg->re_nsub); 1205 if (dfa->subexp_map != NULL) 1206 { 1207 Idx i; 1208 for (i = 0; i < preg->re_nsub; i++) 1209 dfa->subexp_map[i] = i; 1210 preorder (dfa->str_tree, optimize_subexps, dfa); 1211 for (i = 0; i < preg->re_nsub; i++) 1212 if (dfa->subexp_map[i] != i) 1213 break; 1214 if (i == preg->re_nsub) 1215 { 1216 free (dfa->subexp_map); 1217 dfa->subexp_map = NULL; 1218 } 1219 } 1220 1221 ret = postorder (dfa->str_tree, lower_subexps, preg); 1222 if (BE (ret != REG_NOERROR, 0)) 1223 return ret; 1224 ret = postorder (dfa->str_tree, calc_first, dfa); 1225 if (BE (ret != REG_NOERROR, 0)) 1226 return ret; 1227 preorder (dfa->str_tree, calc_next, dfa); 1228 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1229 if (BE (ret != REG_NOERROR, 0)) 1230 return ret; 1231 ret = calc_eclosure (dfa); 1232 if (BE (ret != REG_NOERROR, 0)) 1233 return ret; 1234 1235 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1236 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1237 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1238 || dfa->nbackref) 1239 { 1240 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1241 if (BE (dfa->inveclosures == NULL, 0)) 1242 return REG_ESPACE; 1243 ret = calc_inveclosure (dfa); 1244 } 1245 1246 return ret; 1247} 1248 1249/* Our parse trees are very unbalanced, so we cannot use a stack to 1250 implement parse tree visits. Instead, we use parent pointers and 1251 some hairy code in these two functions. */ 1252static reg_errcode_t 1253postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1254 void *extra) 1255{ 1256 bin_tree_t *node, *prev; 1257 1258 for (node = root; ; ) 1259 { 1260 /* Descend down the tree, preferably to the left (or to the right 1261 if that's the only child). */ 1262 while (node->left || node->right) 1263 if (node->left) 1264 node = node->left; 1265 else 1266 node = node->right; 1267 1268 do 1269 { 1270 reg_errcode_t err = fn (extra, node); 1271 if (BE (err != REG_NOERROR, 0)) 1272 return err; 1273 if (node->parent == NULL) 1274 return REG_NOERROR; 1275 prev = node; 1276 node = node->parent; 1277 } 1278 /* Go up while we have a node that is reached from the right. */ 1279 while (node->right == prev || node->right == NULL); 1280 node = node->right; 1281 } 1282} 1283 1284static reg_errcode_t 1285preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1286 void *extra) 1287{ 1288 bin_tree_t *node; 1289 1290 for (node = root; ; ) 1291 { 1292 reg_errcode_t err = fn (extra, node); 1293 if (BE (err != REG_NOERROR, 0)) 1294 return err; 1295 1296 /* Go to the left node, or up and to the right. */ 1297 if (node->left) 1298 node = node->left; 1299 else 1300 { 1301 bin_tree_t *prev = NULL; 1302 while (node->right == prev || node->right == NULL) 1303 { 1304 prev = node; 1305 node = node->parent; 1306 if (!node) 1307 return REG_NOERROR; 1308 } 1309 node = node->right; 1310 } 1311 } 1312} 1313 1314/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1315 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1316 backreferences as well. Requires a preorder visit. */ 1317static reg_errcode_t 1318optimize_subexps (void *extra, bin_tree_t *node) 1319{ 1320 re_dfa_t *dfa = (re_dfa_t *) extra; 1321 1322 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1323 { 1324 int idx = node->token.opr.idx; 1325 node->token.opr.idx = dfa->subexp_map[idx]; 1326 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1327 } 1328 1329 else if (node->token.type == SUBEXP 1330 && node->left && node->left->token.type == SUBEXP) 1331 { 1332 Idx other_idx = node->left->token.opr.idx; 1333 1334 node->left = node->left->left; 1335 if (node->left) 1336 node->left->parent = node; 1337 1338 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1339 if (other_idx < BITSET_WORD_BITS) 1340 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1341 } 1342 1343 return REG_NOERROR; 1344} 1345 1346/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1347 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1348static reg_errcode_t 1349lower_subexps (void *extra, bin_tree_t *node) 1350{ 1351 regex_t *preg = (regex_t *) extra; 1352 reg_errcode_t err = REG_NOERROR; 1353 1354 if (node->left && node->left->token.type == SUBEXP) 1355 { 1356 node->left = lower_subexp (&err, preg, node->left); 1357 if (node->left) 1358 node->left->parent = node; 1359 } 1360 if (node->right && node->right->token.type == SUBEXP) 1361 { 1362 node->right = lower_subexp (&err, preg, node->right); 1363 if (node->right) 1364 node->right->parent = node; 1365 } 1366 1367 return err; 1368} 1369 1370static bin_tree_t * 1371lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1372{ 1373 re_dfa_t *dfa = preg->buffer; 1374 bin_tree_t *body = node->left; 1375 bin_tree_t *op, *cls, *tree1, *tree; 1376 1377 if (preg->no_sub 1378 /* We do not optimize empty subexpressions, because otherwise we may 1379 have bad CONCAT nodes with NULL children. This is obviously not 1380 very common, so we do not lose much. An example that triggers 1381 this case is the sed "script" /\(\)/x. */ 1382 && node->left != NULL 1383 && (node->token.opr.idx >= BITSET_WORD_BITS 1384 || !(dfa->used_bkref_map 1385 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1386 return node->left; 1387 1388 /* Convert the SUBEXP node to the concatenation of an 1389 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1390 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1391 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1392 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1393 tree = create_tree (dfa, op, tree1, CONCAT); 1394 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1395 { 1396 *err = REG_ESPACE; 1397 return NULL; 1398 } 1399 1400 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1401 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1402 return tree; 1403} 1404 1405/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1406 nodes. Requires a postorder visit. */ 1407static reg_errcode_t 1408calc_first (void *extra, bin_tree_t *node) 1409{ 1410 re_dfa_t *dfa = (re_dfa_t *) extra; 1411 if (node->token.type == CONCAT) 1412 { 1413 node->first = node->left->first; 1414 node->node_idx = node->left->node_idx; 1415 } 1416 else 1417 { 1418 node->first = node; 1419 node->node_idx = re_dfa_add_node (dfa, node->token); 1420 if (BE (node->node_idx == REG_MISSING, 0)) 1421 return REG_ESPACE; 1422 if (node->token.type == ANCHOR) 1423 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1424 } 1425 return REG_NOERROR; 1426} 1427 1428/* Pass 2: compute NEXT on the tree. Preorder visit. */ 1429static reg_errcode_t 1430calc_next (void *extra, bin_tree_t *node) 1431{ 1432 switch (node->token.type) 1433 { 1434 case OP_DUP_ASTERISK: 1435 node->left->next = node; 1436 break; 1437 case CONCAT: 1438 node->left->next = node->right->first; 1439 node->right->next = node->next; 1440 break; 1441 default: 1442 if (node->left) 1443 node->left->next = node->next; 1444 if (node->right) 1445 node->right->next = node->next; 1446 break; 1447 } 1448 return REG_NOERROR; 1449} 1450 1451/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1452static reg_errcode_t 1453link_nfa_nodes (void *extra, bin_tree_t *node) 1454{ 1455 re_dfa_t *dfa = (re_dfa_t *) extra; 1456 Idx idx = node->node_idx; 1457 reg_errcode_t err = REG_NOERROR; 1458 1459 switch (node->token.type) 1460 { 1461 case CONCAT: 1462 break; 1463 1464 case END_OF_RE: 1465 assert (node->next == NULL); 1466 break; 1467 1468 case OP_DUP_ASTERISK: 1469 case OP_ALT: 1470 { 1471 Idx left, right; 1472 dfa->has_plural_match = 1; 1473 if (node->left != NULL) 1474 left = node->left->first->node_idx; 1475 else 1476 left = node->next->node_idx; 1477 if (node->right != NULL) 1478 right = node->right->first->node_idx; 1479 else 1480 right = node->next->node_idx; 1481 assert (REG_VALID_INDEX (left)); 1482 assert (REG_VALID_INDEX (right)); 1483 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1484 } 1485 break; 1486 1487 case ANCHOR: 1488 case OP_OPEN_SUBEXP: 1489 case OP_CLOSE_SUBEXP: 1490 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1491 break; 1492 1493 case OP_BACK_REF: 1494 dfa->nexts[idx] = node->next->node_idx; 1495 if (node->token.type == OP_BACK_REF) 1496 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1497 break; 1498 1499 default: 1500 assert (!IS_EPSILON_NODE (node->token.type)); 1501 dfa->nexts[idx] = node->next->node_idx; 1502 break; 1503 } 1504 1505 return err; 1506} 1507 1508/* Duplicate the epsilon closure of the node ROOT_NODE. 1509 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1510 to their own constraint. */ 1511 1512static reg_errcode_t 1513internal_function 1514duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, 1515 Idx root_node, unsigned int init_constraint) 1516{ 1517 Idx org_node, clone_node; 1518 bool ok; 1519 unsigned int constraint = init_constraint; 1520 for (org_node = top_org_node, clone_node = top_clone_node;;) 1521 { 1522 Idx org_dest, clone_dest; 1523 if (dfa->nodes[org_node].type == OP_BACK_REF) 1524 { 1525 /* If the back reference epsilon-transit, its destination must 1526 also have the constraint. Then duplicate the epsilon closure 1527 of the destination of the back reference, and store it in 1528 edests of the back reference. */ 1529 org_dest = dfa->nexts[org_node]; 1530 re_node_set_empty (dfa->edests + clone_node); 1531 clone_dest = duplicate_node (dfa, org_dest, constraint); 1532 if (BE (clone_dest == REG_MISSING, 0)) 1533 return REG_ESPACE; 1534 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1535 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1536 if (BE (! ok, 0)) 1537 return REG_ESPACE; 1538 } 1539 else if (dfa->edests[org_node].nelem == 0) 1540 { 1541 /* In case of the node can't epsilon-transit, don't duplicate the 1542 destination and store the original destination as the 1543 destination of the node. */ 1544 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1545 break; 1546 } 1547 else if (dfa->edests[org_node].nelem == 1) 1548 { 1549 /* In case of the node can epsilon-transit, and it has only one 1550 destination. */ 1551 org_dest = dfa->edests[org_node].elems[0]; 1552 re_node_set_empty (dfa->edests + clone_node); 1553 /* If the node is root_node itself, it means the epsilon closure 1554 has a loop. Then tie it to the destination of the root_node. */ 1555 if (org_node == root_node && clone_node != org_node) 1556 { 1557 ok = re_node_set_insert (dfa->edests + clone_node, org_dest); 1558 if (BE (! ok, 0)) 1559 return REG_ESPACE; 1560 break; 1561 } 1562 /* In case the node has another constraint, append it. */ 1563 constraint |= dfa->nodes[org_node].constraint; 1564 clone_dest = duplicate_node (dfa, org_dest, constraint); 1565 if (BE (clone_dest == REG_MISSING, 0)) 1566 return REG_ESPACE; 1567 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1568 if (BE (! ok, 0)) 1569 return REG_ESPACE; 1570 } 1571 else /* dfa->edests[org_node].nelem == 2 */ 1572 { 1573 /* In case of the node can epsilon-transit, and it has two 1574 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1575 org_dest = dfa->edests[org_node].elems[0]; 1576 re_node_set_empty (dfa->edests + clone_node); 1577 /* Search for a duplicated node which satisfies the constraint. */ 1578 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1579 if (clone_dest == REG_MISSING) 1580 { 1581 /* There is no such duplicated node, create a new one. */ 1582 reg_errcode_t err; 1583 clone_dest = duplicate_node (dfa, org_dest, constraint); 1584 if (BE (clone_dest == REG_MISSING, 0)) 1585 return REG_ESPACE; 1586 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1587 if (BE (! ok, 0)) 1588 return REG_ESPACE; 1589 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1590 root_node, constraint); 1591 if (BE (err != REG_NOERROR, 0)) 1592 return err; 1593 } 1594 else 1595 { 1596 /* There is a duplicated node which satisfies the constraint, 1597 use it to avoid infinite loop. */ 1598 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1599 if (BE (! ok, 0)) 1600 return REG_ESPACE; 1601 } 1602 1603 org_dest = dfa->edests[org_node].elems[1]; 1604 clone_dest = duplicate_node (dfa, org_dest, constraint); 1605 if (BE (clone_dest == REG_MISSING, 0)) 1606 return REG_ESPACE; 1607 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1608 if (BE (! ok, 0)) 1609 return REG_ESPACE; 1610 } 1611 org_node = org_dest; 1612 clone_node = clone_dest; 1613 } 1614 return REG_NOERROR; 1615} 1616 1617/* Search for a node which is duplicated from the node ORG_NODE, and 1618 satisfies the constraint CONSTRAINT. */ 1619 1620static Idx 1621search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1622 unsigned int constraint) 1623{ 1624 Idx idx; 1625 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1626 { 1627 if (org_node == dfa->org_indices[idx] 1628 && constraint == dfa->nodes[idx].constraint) 1629 return idx; /* Found. */ 1630 } 1631 return REG_MISSING; /* Not found. */ 1632} 1633 1634/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1635 Return the index of the new node, or REG_MISSING if insufficient storage is 1636 available. */ 1637 1638static Idx 1639duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1640{ 1641 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1642 if (BE (dup_idx != REG_MISSING, 1)) 1643 { 1644 dfa->nodes[dup_idx].constraint = constraint; 1645 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1646 dfa->nodes[dup_idx].duplicated = 1; 1647 1648 /* Store the index of the original node. */ 1649 dfa->org_indices[dup_idx] = org_idx; 1650 } 1651 return dup_idx; 1652} 1653 1654static reg_errcode_t 1655calc_inveclosure (re_dfa_t *dfa) 1656{ 1657 Idx src, idx; 1658 bool ok; 1659 for (idx = 0; idx < dfa->nodes_len; ++idx) 1660 re_node_set_init_empty (dfa->inveclosures + idx); 1661 1662 for (src = 0; src < dfa->nodes_len; ++src) 1663 { 1664 Idx *elems = dfa->eclosures[src].elems; 1665 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1666 { 1667 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1668 if (BE (! ok, 0)) 1669 return REG_ESPACE; 1670 } 1671 } 1672 1673 return REG_NOERROR; 1674} 1675 1676/* Calculate "eclosure" for all the node in DFA. */ 1677 1678static reg_errcode_t 1679calc_eclosure (re_dfa_t *dfa) 1680{ 1681 Idx node_idx; 1682 bool incomplete; 1683#ifdef DEBUG 1684 assert (dfa->nodes_len > 0); 1685#endif 1686 incomplete = false; 1687 /* For each nodes, calculate epsilon closure. */ 1688 for (node_idx = 0; ; ++node_idx) 1689 { 1690 reg_errcode_t err; 1691 re_node_set eclosure_elem; 1692 if (node_idx == dfa->nodes_len) 1693 { 1694 if (!incomplete) 1695 break; 1696 incomplete = false; 1697 node_idx = 0; 1698 } 1699 1700#ifdef DEBUG 1701 assert (dfa->eclosures[node_idx].nelem != REG_MISSING); 1702#endif 1703 1704 /* If we have already calculated, skip it. */ 1705 if (dfa->eclosures[node_idx].nelem != 0) 1706 continue; 1707 /* Calculate epsilon closure of 'node_idx'. */ 1708 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1709 if (BE (err != REG_NOERROR, 0)) 1710 return err; 1711 1712 if (dfa->eclosures[node_idx].nelem == 0) 1713 { 1714 incomplete = true; 1715 re_node_set_free (&eclosure_elem); 1716 } 1717 } 1718 return REG_NOERROR; 1719} 1720 1721/* Calculate epsilon closure of NODE. */ 1722 1723static reg_errcode_t 1724calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1725{ 1726 reg_errcode_t err; 1727 Idx i; 1728 re_node_set eclosure; 1729 bool ok; 1730 bool incomplete = false; 1731 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1732 if (BE (err != REG_NOERROR, 0)) 1733 return err; 1734 1735 /* This indicates that we are calculating this node now. 1736 We reference this value to avoid infinite loop. */ 1737 dfa->eclosures[node].nelem = REG_MISSING; 1738 1739 /* If the current node has constraints, duplicate all nodes 1740 since they must inherit the constraints. */ 1741 if (dfa->nodes[node].constraint 1742 && dfa->edests[node].nelem 1743 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1744 { 1745 err = duplicate_node_closure (dfa, node, node, node, 1746 dfa->nodes[node].constraint); 1747 if (BE (err != REG_NOERROR, 0)) 1748 return err; 1749 } 1750 1751 /* Expand each epsilon destination nodes. */ 1752 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1753 for (i = 0; i < dfa->edests[node].nelem; ++i) 1754 { 1755 re_node_set eclosure_elem; 1756 Idx edest = dfa->edests[node].elems[i]; 1757 /* If calculating the epsilon closure of 'edest' is in progress, 1758 return intermediate result. */ 1759 if (dfa->eclosures[edest].nelem == REG_MISSING) 1760 { 1761 incomplete = true; 1762 continue; 1763 } 1764 /* If we haven't calculated the epsilon closure of 'edest' yet, 1765 calculate now. Otherwise use calculated epsilon closure. */ 1766 if (dfa->eclosures[edest].nelem == 0) 1767 { 1768 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1769 if (BE (err != REG_NOERROR, 0)) 1770 return err; 1771 } 1772 else 1773 eclosure_elem = dfa->eclosures[edest]; 1774 /* Merge the epsilon closure of 'edest'. */ 1775 err = re_node_set_merge (&eclosure, &eclosure_elem); 1776 if (BE (err != REG_NOERROR, 0)) 1777 return err; 1778 /* If the epsilon closure of 'edest' is incomplete, 1779 the epsilon closure of this node is also incomplete. */ 1780 if (dfa->eclosures[edest].nelem == 0) 1781 { 1782 incomplete = true; 1783 re_node_set_free (&eclosure_elem); 1784 } 1785 } 1786 1787 /* An epsilon closure includes itself. */ 1788 ok = re_node_set_insert (&eclosure, node); 1789 if (BE (! ok, 0)) 1790 return REG_ESPACE; 1791 if (incomplete && !root) 1792 dfa->eclosures[node].nelem = 0; 1793 else 1794 dfa->eclosures[node] = eclosure; 1795 *new_set = eclosure; 1796 return REG_NOERROR; 1797} 1798 1799/* Functions for token which are used in the parser. */ 1800 1801/* Fetch a token from INPUT. 1802 We must not use this function inside bracket expressions. */ 1803 1804static void 1805internal_function 1806fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1807{ 1808 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1809} 1810 1811/* Peek a token from INPUT, and return the length of the token. 1812 We must not use this function inside bracket expressions. */ 1813 1814static int 1815internal_function 1816peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1817{ 1818 unsigned char c; 1819 1820 if (re_string_eoi (input)) 1821 { 1822 token->type = END_OF_RE; 1823 return 0; 1824 } 1825 1826 c = re_string_peek_byte (input, 0); 1827 token->opr.c = c; 1828 1829 token->word_char = 0; 1830#ifdef RE_ENABLE_I18N 1831 token->mb_partial = 0; 1832 if (input->mb_cur_max > 1 && 1833 !re_string_first_byte (input, re_string_cur_idx (input))) 1834 { 1835 token->type = CHARACTER; 1836 token->mb_partial = 1; 1837 return 1; 1838 } 1839#endif 1840 if (c == '\\') 1841 { 1842 unsigned char c2; 1843 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1844 { 1845 token->type = BACK_SLASH; 1846 return 1; 1847 } 1848 1849 c2 = re_string_peek_byte_case (input, 1); 1850 token->opr.c = c2; 1851 token->type = CHARACTER; 1852#ifdef RE_ENABLE_I18N 1853 if (input->mb_cur_max > 1) 1854 { 1855 wint_t wc = re_string_wchar_at (input, 1856 re_string_cur_idx (input) + 1); 1857 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1858 } 1859 else 1860#endif 1861 token->word_char = IS_WORD_CHAR (c2) != 0; 1862 1863 switch (c2) 1864 { 1865 case '|': 1866 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1867 token->type = OP_ALT; 1868 break; 1869 case '1': case '2': case '3': case '4': case '5': 1870 case '6': case '7': case '8': case '9': 1871 if (!(syntax & RE_NO_BK_REFS)) 1872 { 1873 token->type = OP_BACK_REF; 1874 token->opr.idx = c2 - '1'; 1875 } 1876 break; 1877 case '<': 1878 if (!(syntax & RE_NO_GNU_OPS)) 1879 { 1880 token->type = ANCHOR; 1881 token->opr.ctx_type = WORD_FIRST; 1882 } 1883 break; 1884 case '>': 1885 if (!(syntax & RE_NO_GNU_OPS)) 1886 { 1887 token->type = ANCHOR; 1888 token->opr.ctx_type = WORD_LAST; 1889 } 1890 break; 1891 case 'b': 1892 if (!(syntax & RE_NO_GNU_OPS)) 1893 { 1894 token->type = ANCHOR; 1895 token->opr.ctx_type = WORD_DELIM; 1896 } 1897 break; 1898 case 'B': 1899 if (!(syntax & RE_NO_GNU_OPS)) 1900 { 1901 token->type = ANCHOR; 1902 token->opr.ctx_type = NOT_WORD_DELIM; 1903 } 1904 break; 1905 case 'w': 1906 if (!(syntax & RE_NO_GNU_OPS)) 1907 token->type = OP_WORD; 1908 break; 1909 case 'W': 1910 if (!(syntax & RE_NO_GNU_OPS)) 1911 token->type = OP_NOTWORD; 1912 break; 1913 case 's': 1914 if (!(syntax & RE_NO_GNU_OPS)) 1915 token->type = OP_SPACE; 1916 break; 1917 case 'S': 1918 if (!(syntax & RE_NO_GNU_OPS)) 1919 token->type = OP_NOTSPACE; 1920 break; 1921 case '`': 1922 if (!(syntax & RE_NO_GNU_OPS)) 1923 { 1924 token->type = ANCHOR; 1925 token->opr.ctx_type = BUF_FIRST; 1926 } 1927 break; 1928 case '\'': 1929 if (!(syntax & RE_NO_GNU_OPS)) 1930 { 1931 token->type = ANCHOR; 1932 token->opr.ctx_type = BUF_LAST; 1933 } 1934 break; 1935 case '(': 1936 if (!(syntax & RE_NO_BK_PARENS)) 1937 token->type = OP_OPEN_SUBEXP; 1938 break; 1939 case ')': 1940 if (!(syntax & RE_NO_BK_PARENS)) 1941 token->type = OP_CLOSE_SUBEXP; 1942 break; 1943 case '+': 1944 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1945 token->type = OP_DUP_PLUS; 1946 break; 1947 case '?': 1948 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1949 token->type = OP_DUP_QUESTION; 1950 break; 1951 case '{': 1952 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1953 token->type = OP_OPEN_DUP_NUM; 1954 break; 1955 case '}': 1956 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1957 token->type = OP_CLOSE_DUP_NUM; 1958 break; 1959 default: 1960 break; 1961 } 1962 return 2; 1963 } 1964 1965 token->type = CHARACTER; 1966#ifdef RE_ENABLE_I18N 1967 if (input->mb_cur_max > 1) 1968 { 1969 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1970 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1971 } 1972 else 1973#endif 1974 token->word_char = IS_WORD_CHAR (token->opr.c); 1975 1976 switch (c) 1977 { 1978 case '\n': 1979 if (syntax & RE_NEWLINE_ALT) 1980 token->type = OP_ALT; 1981 break; 1982 case '|': 1983 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1984 token->type = OP_ALT; 1985 break; 1986 case '*': 1987 token->type = OP_DUP_ASTERISK; 1988 break; 1989 case '+': 1990 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1991 token->type = OP_DUP_PLUS; 1992 break; 1993 case '?': 1994 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1995 token->type = OP_DUP_QUESTION; 1996 break; 1997 case '{': 1998 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1999 token->type = OP_OPEN_DUP_NUM; 2000 break; 2001 case '}': 2002 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 2003 token->type = OP_CLOSE_DUP_NUM; 2004 break; 2005 case '(': 2006 if (syntax & RE_NO_BK_PARENS) 2007 token->type = OP_OPEN_SUBEXP; 2008 break; 2009 case ')': 2010 if (syntax & RE_NO_BK_PARENS) 2011 token->type = OP_CLOSE_SUBEXP; 2012 break; 2013 case '[': 2014 token->type = OP_OPEN_BRACKET; 2015 break; 2016 case '.': 2017 token->type = OP_PERIOD; 2018 break; 2019 case '^': 2020 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 2021 re_string_cur_idx (input) != 0) 2022 { 2023 char prev = re_string_peek_byte (input, -1); 2024 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 2025 break; 2026 } 2027 token->type = ANCHOR; 2028 token->opr.ctx_type = LINE_FIRST; 2029 break; 2030 case '$': 2031 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 2032 re_string_cur_idx (input) + 1 != re_string_length (input)) 2033 { 2034 re_token_t next; 2035 re_string_skip_bytes (input, 1); 2036 peek_token (&next, input, syntax); 2037 re_string_skip_bytes (input, -1); 2038 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 2039 break; 2040 } 2041 token->type = ANCHOR; 2042 token->opr.ctx_type = LINE_LAST; 2043 break; 2044 default: 2045 break; 2046 } 2047 return 1; 2048} 2049 2050/* Peek a token from INPUT, and return the length of the token. 2051 We must not use this function out of bracket expressions. */ 2052 2053static int 2054internal_function 2055peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2056{ 2057 unsigned char c; 2058 if (re_string_eoi (input)) 2059 { 2060 token->type = END_OF_RE; 2061 return 0; 2062 } 2063 c = re_string_peek_byte (input, 0); 2064 token->opr.c = c; 2065 2066#ifdef RE_ENABLE_I18N 2067 if (input->mb_cur_max > 1 && 2068 !re_string_first_byte (input, re_string_cur_idx (input))) 2069 { 2070 token->type = CHARACTER; 2071 return 1; 2072 } 2073#endif /* RE_ENABLE_I18N */ 2074 2075 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2076 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2077 { 2078 /* In this case, '\' escape a character. */ 2079 unsigned char c2; 2080 re_string_skip_bytes (input, 1); 2081 c2 = re_string_peek_byte (input, 0); 2082 token->opr.c = c2; 2083 token->type = CHARACTER; 2084 return 1; 2085 } 2086 if (c == '[') /* '[' is a special char in a bracket exps. */ 2087 { 2088 unsigned char c2; 2089 int token_len; 2090 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2091 c2 = re_string_peek_byte (input, 1); 2092 else 2093 c2 = 0; 2094 token->opr.c = c2; 2095 token_len = 2; 2096 switch (c2) 2097 { 2098 case '.': 2099 token->type = OP_OPEN_COLL_ELEM; 2100 break; 2101 case '=': 2102 token->type = OP_OPEN_EQUIV_CLASS; 2103 break; 2104 case ':': 2105 if (syntax & RE_CHAR_CLASSES) 2106 { 2107 token->type = OP_OPEN_CHAR_CLASS; 2108 break; 2109 } 2110 /* else fall through. */ 2111 default: 2112 token->type = CHARACTER; 2113 token->opr.c = c; 2114 token_len = 1; 2115 break; 2116 } 2117 return token_len; 2118 } 2119 switch (c) 2120 { 2121 case '-': 2122 token->type = OP_CHARSET_RANGE; 2123 break; 2124 case ']': 2125 token->type = OP_CLOSE_BRACKET; 2126 break; 2127 case '^': 2128 token->type = OP_NON_MATCH_LIST; 2129 break; 2130 default: 2131 token->type = CHARACTER; 2132 } 2133 return 1; 2134} 2135 2136/* Functions for parser. */ 2137 2138/* Entry point of the parser. 2139 Parse the regular expression REGEXP and return the structure tree. 2140 If an error occurs, ERR is set by error code, and return NULL. 2141 This function build the following tree, from regular expression <reg_exp>: 2142 CAT 2143 / \ 2144 / \ 2145 <reg_exp> EOR 2146 2147 CAT means concatenation. 2148 EOR means end of regular expression. */ 2149 2150static bin_tree_t * 2151parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2152 reg_errcode_t *err) 2153{ 2154 re_dfa_t *dfa = preg->buffer; 2155 bin_tree_t *tree, *eor, *root; 2156 re_token_t current_token; 2157 dfa->syntax = syntax; 2158 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2159 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2160 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2161 return NULL; 2162 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2163 if (tree != NULL) 2164 root = create_tree (dfa, tree, eor, CONCAT); 2165 else 2166 root = eor; 2167 if (BE (eor == NULL || root == NULL, 0)) 2168 { 2169 *err = REG_ESPACE; 2170 return NULL; 2171 } 2172 return root; 2173} 2174 2175/* This function build the following tree, from regular expression 2176 <branch1>|<branch2>: 2177 ALT 2178 / \ 2179 / \ 2180 <branch1> <branch2> 2181 2182 ALT means alternative, which represents the operator '|'. */ 2183 2184static bin_tree_t * 2185parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2186 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2187{ 2188 re_dfa_t *dfa = preg->buffer; 2189 bin_tree_t *tree, *branch = NULL; 2190 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2191 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2192 return NULL; 2193 2194 while (token->type == OP_ALT) 2195 { 2196 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2197 if (token->type != OP_ALT && token->type != END_OF_RE 2198 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2199 { 2200 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2201 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2202 return NULL; 2203 } 2204 else 2205 branch = NULL; 2206 tree = create_tree (dfa, tree, branch, OP_ALT); 2207 if (BE (tree == NULL, 0)) 2208 { 2209 *err = REG_ESPACE; 2210 return NULL; 2211 } 2212 } 2213 return tree; 2214} 2215 2216/* This function build the following tree, from regular expression 2217 <exp1><exp2>: 2218 CAT 2219 / \ 2220 / \ 2221 <exp1> <exp2> 2222 2223 CAT means concatenation. */ 2224 2225static bin_tree_t * 2226parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2227 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2228{ 2229 bin_tree_t *tree, *expr; 2230 re_dfa_t *dfa = preg->buffer; 2231 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2232 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2233 return NULL; 2234 2235 while (token->type != OP_ALT && token->type != END_OF_RE 2236 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2237 { 2238 expr = parse_expression (regexp, preg, token, syntax, nest, err); 2239 if (BE (*err != REG_NOERROR && expr == NULL, 0)) 2240 { 2241 if (tree != NULL) 2242 postorder (tree, free_tree, NULL); 2243 return NULL; 2244 } 2245 if (tree != NULL && expr != NULL) 2246 { 2247 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT); 2248 if (newtree == NULL) 2249 { 2250 postorder (expr, free_tree, NULL); 2251 postorder (tree, free_tree, NULL); 2252 *err = REG_ESPACE; 2253 return NULL; 2254 } 2255 tree = newtree; 2256 } 2257 else if (tree == NULL) 2258 tree = expr; 2259 /* Otherwise expr == NULL, we don't need to create new tree. */ 2260 } 2261 return tree; 2262} 2263 2264/* This function build the following tree, from regular expression a*: 2265 * 2266 | 2267 a 2268*/ 2269 2270static bin_tree_t * 2271parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2272 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2273{ 2274 re_dfa_t *dfa = preg->buffer; 2275 bin_tree_t *tree; 2276 switch (token->type) 2277 { 2278 case CHARACTER: 2279 tree = create_token_tree (dfa, NULL, NULL, token); 2280 if (BE (tree == NULL, 0)) 2281 { 2282 *err = REG_ESPACE; 2283 return NULL; 2284 } 2285#ifdef RE_ENABLE_I18N 2286 if (dfa->mb_cur_max > 1) 2287 { 2288 while (!re_string_eoi (regexp) 2289 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2290 { 2291 bin_tree_t *mbc_remain; 2292 fetch_token (token, regexp, syntax); 2293 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2294 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2295 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2296 { 2297 *err = REG_ESPACE; 2298 return NULL; 2299 } 2300 } 2301 } 2302#endif 2303 break; 2304 case OP_OPEN_SUBEXP: 2305 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2306 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2307 return NULL; 2308 break; 2309 case OP_OPEN_BRACKET: 2310 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2311 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2312 return NULL; 2313 break; 2314 case OP_BACK_REF: 2315 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2316 { 2317 *err = REG_ESUBREG; 2318 return NULL; 2319 } 2320 dfa->used_bkref_map |= 1 << token->opr.idx; 2321 tree = create_token_tree (dfa, NULL, NULL, token); 2322 if (BE (tree == NULL, 0)) 2323 { 2324 *err = REG_ESPACE; 2325 return NULL; 2326 } 2327 ++dfa->nbackref; 2328 dfa->has_mb_node = 1; 2329 break; 2330 case OP_OPEN_DUP_NUM: 2331 if (syntax & RE_CONTEXT_INVALID_DUP) 2332 { 2333 *err = REG_BADRPT; 2334 return NULL; 2335 } 2336 /* FALLTHROUGH */ 2337 case OP_DUP_ASTERISK: 2338 case OP_DUP_PLUS: 2339 case OP_DUP_QUESTION: 2340 if (syntax & RE_CONTEXT_INVALID_OPS) 2341 { 2342 *err = REG_BADRPT; 2343 return NULL; 2344 } 2345 else if (syntax & RE_CONTEXT_INDEP_OPS) 2346 { 2347 fetch_token (token, regexp, syntax); 2348 return parse_expression (regexp, preg, token, syntax, nest, err); 2349 } 2350 /* else fall through */ 2351 case OP_CLOSE_SUBEXP: 2352 if ((token->type == OP_CLOSE_SUBEXP) && 2353 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2354 { 2355 *err = REG_ERPAREN; 2356 return NULL; 2357 } 2358 /* else fall through */ 2359 case OP_CLOSE_DUP_NUM: 2360 /* We treat it as a normal character. */ 2361 2362 /* Then we can these characters as normal characters. */ 2363 token->type = CHARACTER; 2364 /* mb_partial and word_char bits should be initialized already 2365 by peek_token. */ 2366 tree = create_token_tree (dfa, NULL, NULL, token); 2367 if (BE (tree == NULL, 0)) 2368 { 2369 *err = REG_ESPACE; 2370 return NULL; 2371 } 2372 break; 2373 case ANCHOR: 2374 if ((token->opr.ctx_type 2375 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2376 && dfa->word_ops_used == 0) 2377 init_word_char (dfa); 2378 if (token->opr.ctx_type == WORD_DELIM 2379 || token->opr.ctx_type == NOT_WORD_DELIM) 2380 { 2381 bin_tree_t *tree_first, *tree_last; 2382 if (token->opr.ctx_type == WORD_DELIM) 2383 { 2384 token->opr.ctx_type = WORD_FIRST; 2385 tree_first = create_token_tree (dfa, NULL, NULL, token); 2386 token->opr.ctx_type = WORD_LAST; 2387 } 2388 else 2389 { 2390 token->opr.ctx_type = INSIDE_WORD; 2391 tree_first = create_token_tree (dfa, NULL, NULL, token); 2392 token->opr.ctx_type = INSIDE_NOTWORD; 2393 } 2394 tree_last = create_token_tree (dfa, NULL, NULL, token); 2395 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2396 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2397 { 2398 *err = REG_ESPACE; 2399 return NULL; 2400 } 2401 } 2402 else 2403 { 2404 tree = create_token_tree (dfa, NULL, NULL, token); 2405 if (BE (tree == NULL, 0)) 2406 { 2407 *err = REG_ESPACE; 2408 return NULL; 2409 } 2410 } 2411 /* We must return here, since ANCHORs can't be followed 2412 by repetition operators. 2413 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2414 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2415 fetch_token (token, regexp, syntax); 2416 return tree; 2417 case OP_PERIOD: 2418 tree = create_token_tree (dfa, NULL, NULL, token); 2419 if (BE (tree == NULL, 0)) 2420 { 2421 *err = REG_ESPACE; 2422 return NULL; 2423 } 2424 if (dfa->mb_cur_max > 1) 2425 dfa->has_mb_node = 1; 2426 break; 2427 case OP_WORD: 2428 case OP_NOTWORD: 2429 tree = build_charclass_op (dfa, regexp->trans, 2430 "alnum", 2431 "_", 2432 token->type == OP_NOTWORD, err); 2433 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2434 return NULL; 2435 break; 2436 case OP_SPACE: 2437 case OP_NOTSPACE: 2438 tree = build_charclass_op (dfa, regexp->trans, 2439 "space", 2440 "", 2441 token->type == OP_NOTSPACE, err); 2442 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2443 return NULL; 2444 break; 2445 case OP_ALT: 2446 case END_OF_RE: 2447 return NULL; 2448 case BACK_SLASH: 2449 *err = REG_EESCAPE; 2450 return NULL; 2451 default: 2452 /* Must not happen? */ 2453#ifdef DEBUG 2454 assert (0); 2455#endif 2456 return NULL; 2457 } 2458 fetch_token (token, regexp, syntax); 2459 2460 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2461 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2462 { 2463 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2464 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2465 return NULL; 2466 /* In BRE consecutive duplications are not allowed. */ 2467 if ((syntax & RE_CONTEXT_INVALID_DUP) 2468 && (token->type == OP_DUP_ASTERISK 2469 || token->type == OP_OPEN_DUP_NUM)) 2470 { 2471 *err = REG_BADRPT; 2472 return NULL; 2473 } 2474 } 2475 2476 return tree; 2477} 2478 2479/* This function build the following tree, from regular expression 2480 (<reg_exp>): 2481 SUBEXP 2482 | 2483 <reg_exp> 2484*/ 2485 2486static bin_tree_t * 2487parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2488 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2489{ 2490 re_dfa_t *dfa = preg->buffer; 2491 bin_tree_t *tree; 2492 size_t cur_nsub; 2493 cur_nsub = preg->re_nsub++; 2494 2495 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2496 2497 /* The subexpression may be a null string. */ 2498 if (token->type == OP_CLOSE_SUBEXP) 2499 tree = NULL; 2500 else 2501 { 2502 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2503 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2504 { 2505 if (tree != NULL) 2506 postorder (tree, free_tree, NULL); 2507 *err = REG_EPAREN; 2508 } 2509 if (BE (*err != REG_NOERROR, 0)) 2510 return NULL; 2511 } 2512 2513 if (cur_nsub <= '9' - '1') 2514 dfa->completed_bkref_map |= 1 << cur_nsub; 2515 2516 tree = create_tree (dfa, tree, NULL, SUBEXP); 2517 if (BE (tree == NULL, 0)) 2518 { 2519 *err = REG_ESPACE; 2520 return NULL; 2521 } 2522 tree->token.opr.idx = cur_nsub; 2523 return tree; 2524} 2525 2526/* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2527 2528static bin_tree_t * 2529parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2530 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2531{ 2532 bin_tree_t *tree = NULL, *old_tree = NULL; 2533 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2534 re_token_t start_token = *token; 2535 2536 if (token->type == OP_OPEN_DUP_NUM) 2537 { 2538 end = 0; 2539 start = fetch_number (regexp, token, syntax); 2540 if (start == REG_MISSING) 2541 { 2542 if (token->type == CHARACTER && token->opr.c == ',') 2543 start = 0; /* We treat "{,m}" as "{0,m}". */ 2544 else 2545 { 2546 *err = REG_BADBR; /* <re>{} is invalid. */ 2547 return NULL; 2548 } 2549 } 2550 if (BE (start != REG_ERROR, 1)) 2551 { 2552 /* We treat "{n}" as "{n,n}". */ 2553 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2554 : ((token->type == CHARACTER && token->opr.c == ',') 2555 ? fetch_number (regexp, token, syntax) : REG_ERROR)); 2556 } 2557 if (BE (start == REG_ERROR || end == REG_ERROR, 0)) 2558 { 2559 /* Invalid sequence. */ 2560 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2561 { 2562 if (token->type == END_OF_RE) 2563 *err = REG_EBRACE; 2564 else 2565 *err = REG_BADBR; 2566 2567 return NULL; 2568 } 2569 2570 /* If the syntax bit is set, rollback. */ 2571 re_string_set_index (regexp, start_idx); 2572 *token = start_token; 2573 token->type = CHARACTER; 2574 /* mb_partial and word_char bits should be already initialized by 2575 peek_token. */ 2576 return elem; 2577 } 2578 2579 if (BE ((end != REG_MISSING && start > end) 2580 || token->type != OP_CLOSE_DUP_NUM, 0)) 2581 { 2582 /* First number greater than second. */ 2583 *err = REG_BADBR; 2584 return NULL; 2585 } 2586 2587 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0)) 2588 { 2589 *err = REG_ESIZE; 2590 return NULL; 2591 } 2592 } 2593 else 2594 { 2595 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2596 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; 2597 } 2598 2599 fetch_token (token, regexp, syntax); 2600 2601 if (BE (elem == NULL, 0)) 2602 return NULL; 2603 if (BE (start == 0 && end == 0, 0)) 2604 { 2605 postorder (elem, free_tree, NULL); 2606 return NULL; 2607 } 2608 2609 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2610 if (BE (start > 0, 0)) 2611 { 2612 tree = elem; 2613 for (i = 2; i <= start; ++i) 2614 { 2615 elem = duplicate_tree (elem, dfa); 2616 tree = create_tree (dfa, tree, elem, CONCAT); 2617 if (BE (elem == NULL || tree == NULL, 0)) 2618 goto parse_dup_op_espace; 2619 } 2620 2621 if (start == end) 2622 return tree; 2623 2624 /* Duplicate ELEM before it is marked optional. */ 2625 elem = duplicate_tree (elem, dfa); 2626 old_tree = tree; 2627 } 2628 else 2629 old_tree = NULL; 2630 2631 if (elem->token.type == SUBEXP) 2632 { 2633 uintptr_t subidx = elem->token.opr.idx; 2634 postorder (elem, mark_opt_subexp, (void *) subidx); 2635 } 2636 2637 tree = create_tree (dfa, elem, NULL, 2638 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2639 if (BE (tree == NULL, 0)) 2640 goto parse_dup_op_espace; 2641 2642/* From gnulib's "intprops.h": 2643 True if the arithmetic type T is signed. */ 2644#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) 2645 2646 /* This loop is actually executed only when end != REG_MISSING, 2647 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2648 already created the start+1-th copy. */ 2649 if (TYPE_SIGNED (Idx) || end != REG_MISSING) 2650 for (i = start + 2; i <= end; ++i) 2651 { 2652 elem = duplicate_tree (elem, dfa); 2653 tree = create_tree (dfa, tree, elem, CONCAT); 2654 if (BE (elem == NULL || tree == NULL, 0)) 2655 goto parse_dup_op_espace; 2656 2657 tree = create_tree (dfa, tree, NULL, OP_ALT); 2658 if (BE (tree == NULL, 0)) 2659 goto parse_dup_op_espace; 2660 } 2661 2662 if (old_tree) 2663 tree = create_tree (dfa, old_tree, tree, CONCAT); 2664 2665 return tree; 2666 2667 parse_dup_op_espace: 2668 *err = REG_ESPACE; 2669 return NULL; 2670} 2671 2672/* Size of the names for collating symbol/equivalence_class/character_class. 2673 I'm not sure, but maybe enough. */ 2674#define BRACKET_NAME_BUF_SIZE 32 2675 2676#ifndef _LIBC 2677 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2678 Build the range expression which starts from START_ELEM, and ends 2679 at END_ELEM. The result are written to MBCSET and SBCSET. 2680 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2681 mbcset->range_ends, is a pointer argument since we may 2682 update it. */ 2683 2684static reg_errcode_t 2685internal_function 2686# ifdef RE_ENABLE_I18N 2687build_range_exp (const reg_syntax_t syntax, 2688 bitset_t sbcset, 2689 re_charset_t *mbcset, 2690 Idx *range_alloc, 2691 const bracket_elem_t *start_elem, 2692 const bracket_elem_t *end_elem) 2693# else /* not RE_ENABLE_I18N */ 2694build_range_exp (const reg_syntax_t syntax, 2695 bitset_t sbcset, 2696 const bracket_elem_t *start_elem, 2697 const bracket_elem_t *end_elem) 2698# endif /* not RE_ENABLE_I18N */ 2699{ 2700 unsigned int start_ch, end_ch; 2701 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2702 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2703 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2704 0)) 2705 return REG_ERANGE; 2706 2707 /* We can handle no multi character collating elements without libc 2708 support. */ 2709 if (BE ((start_elem->type == COLL_SYM 2710 && strlen ((char *) start_elem->opr.name) > 1) 2711 || (end_elem->type == COLL_SYM 2712 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2713 return REG_ECOLLATE; 2714 2715# ifdef RE_ENABLE_I18N 2716 { 2717 wchar_t wc; 2718 wint_t start_wc; 2719 wint_t end_wc; 2720 2721 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2722 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2723 : 0)); 2724 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2725 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2726 : 0)); 2727 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2728 ? __btowc (start_ch) : start_elem->opr.wch); 2729 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2730 ? __btowc (end_ch) : end_elem->opr.wch); 2731 if (start_wc == WEOF || end_wc == WEOF) 2732 return REG_ECOLLATE; 2733 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0)) 2734 return REG_ERANGE; 2735 2736 /* Got valid collation sequence values, add them as a new entry. 2737 However, for !_LIBC we have no collation elements: if the 2738 character set is single byte, the single byte character set 2739 that we build below suffices. parse_bracket_exp passes 2740 no MBCSET if dfa->mb_cur_max == 1. */ 2741 if (mbcset) 2742 { 2743 /* Check the space of the arrays. */ 2744 if (BE (*range_alloc == mbcset->nranges, 0)) 2745 { 2746 /* There is not enough space, need realloc. */ 2747 wchar_t *new_array_start, *new_array_end; 2748 Idx new_nranges; 2749 2750 /* +1 in case of mbcset->nranges is 0. */ 2751 new_nranges = 2 * mbcset->nranges + 1; 2752 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2753 are NULL if *range_alloc == 0. */ 2754 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2755 new_nranges); 2756 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2757 new_nranges); 2758 2759 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2760 return REG_ESPACE; 2761 2762 mbcset->range_starts = new_array_start; 2763 mbcset->range_ends = new_array_end; 2764 *range_alloc = new_nranges; 2765 } 2766 2767 mbcset->range_starts[mbcset->nranges] = start_wc; 2768 mbcset->range_ends[mbcset->nranges++] = end_wc; 2769 } 2770 2771 /* Build the table for single byte characters. */ 2772 for (wc = 0; wc < SBC_MAX; ++wc) 2773 { 2774 if (start_wc <= wc && wc <= end_wc) 2775 bitset_set (sbcset, wc); 2776 } 2777 } 2778# else /* not RE_ENABLE_I18N */ 2779 { 2780 unsigned int ch; 2781 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2782 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2783 : 0)); 2784 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2785 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2786 : 0)); 2787 if (start_ch > end_ch) 2788 return REG_ERANGE; 2789 /* Build the table for single byte characters. */ 2790 for (ch = 0; ch < SBC_MAX; ++ch) 2791 if (start_ch <= ch && ch <= end_ch) 2792 bitset_set (sbcset, ch); 2793 } 2794# endif /* not RE_ENABLE_I18N */ 2795 return REG_NOERROR; 2796} 2797#endif /* not _LIBC */ 2798 2799#ifndef _LIBC 2800/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2801 Build the collating element which is represented by NAME. 2802 The result are written to MBCSET and SBCSET. 2803 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2804 pointer argument since we may update it. */ 2805 2806static reg_errcode_t 2807internal_function 2808# ifdef RE_ENABLE_I18N 2809build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 2810 Idx *coll_sym_alloc, const unsigned char *name) 2811# else /* not RE_ENABLE_I18N */ 2812build_collating_symbol (bitset_t sbcset, const unsigned char *name) 2813# endif /* not RE_ENABLE_I18N */ 2814{ 2815 size_t name_len = strlen ((const char *) name); 2816 if (BE (name_len != 1, 0)) 2817 return REG_ECOLLATE; 2818 else 2819 { 2820 bitset_set (sbcset, name[0]); 2821 return REG_NOERROR; 2822 } 2823} 2824#endif /* not _LIBC */ 2825 2826/* This function parse bracket expression like "[abc]", "[a-c]", 2827 "[[.a-a.]]" etc. */ 2828 2829static bin_tree_t * 2830parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2831 reg_syntax_t syntax, reg_errcode_t *err) 2832{ 2833#ifdef _LIBC 2834 const unsigned char *collseqmb; 2835 const char *collseqwc; 2836 uint32_t nrules; 2837 int32_t table_size; 2838 const int32_t *symb_table; 2839 const unsigned char *extra; 2840 2841 /* Local function for parse_bracket_exp used in _LIBC environment. 2842 Seek the collating symbol entry corresponding to NAME. 2843 Return the index of the symbol in the SYMB_TABLE, 2844 or -1 if not found. */ 2845 2846 auto inline int32_t 2847 __attribute__ ((always_inline)) 2848 seek_collating_symbol_entry (const unsigned char *name, size_t name_len) 2849 { 2850 int32_t elem; 2851 2852 for (elem = 0; elem < table_size; elem++) 2853 if (symb_table[2 * elem] != 0) 2854 { 2855 int32_t idx = symb_table[2 * elem + 1]; 2856 /* Skip the name of collating element name. */ 2857 idx += 1 + extra[idx]; 2858 if (/* Compare the length of the name. */ 2859 name_len == extra[idx] 2860 /* Compare the name. */ 2861 && memcmp (name, &extra[idx + 1], name_len) == 0) 2862 /* Yep, this is the entry. */ 2863 return elem; 2864 } 2865 return -1; 2866 } 2867 2868 /* Local function for parse_bracket_exp used in _LIBC environment. 2869 Look up the collation sequence value of BR_ELEM. 2870 Return the value if succeeded, UINT_MAX otherwise. */ 2871 2872 auto inline unsigned int 2873 __attribute__ ((always_inline)) 2874 lookup_collation_sequence_value (bracket_elem_t *br_elem) 2875 { 2876 if (br_elem->type == SB_CHAR) 2877 { 2878 /* 2879 if (MB_CUR_MAX == 1) 2880 */ 2881 if (nrules == 0) 2882 return collseqmb[br_elem->opr.ch]; 2883 else 2884 { 2885 wint_t wc = __btowc (br_elem->opr.ch); 2886 return __collseq_table_lookup (collseqwc, wc); 2887 } 2888 } 2889 else if (br_elem->type == MB_CHAR) 2890 { 2891 if (nrules != 0) 2892 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2893 } 2894 else if (br_elem->type == COLL_SYM) 2895 { 2896 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2897 if (nrules != 0) 2898 { 2899 int32_t elem, idx; 2900 elem = seek_collating_symbol_entry (br_elem->opr.name, 2901 sym_name_len); 2902 if (elem != -1) 2903 { 2904 /* We found the entry. */ 2905 idx = symb_table[2 * elem + 1]; 2906 /* Skip the name of collating element name. */ 2907 idx += 1 + extra[idx]; 2908 /* Skip the byte sequence of the collating element. */ 2909 idx += 1 + extra[idx]; 2910 /* Adjust for the alignment. */ 2911 idx = (idx + 3) & ~3; 2912 /* Skip the multibyte collation sequence value. */ 2913 idx += sizeof (unsigned int); 2914 /* Skip the wide char sequence of the collating element. */ 2915 idx += sizeof (unsigned int) * 2916 (1 + *(unsigned int *) (extra + idx)); 2917 /* Return the collation sequence value. */ 2918 return *(unsigned int *) (extra + idx); 2919 } 2920 else if (sym_name_len == 1) 2921 { 2922 /* No valid character. Match it as a single byte 2923 character. */ 2924 return collseqmb[br_elem->opr.name[0]]; 2925 } 2926 } 2927 else if (sym_name_len == 1) 2928 return collseqmb[br_elem->opr.name[0]]; 2929 } 2930 return UINT_MAX; 2931 } 2932 2933 /* Local function for parse_bracket_exp used in _LIBC environment. 2934 Build the range expression which starts from START_ELEM, and ends 2935 at END_ELEM. The result are written to MBCSET and SBCSET. 2936 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2937 mbcset->range_ends, is a pointer argument since we may 2938 update it. */ 2939 2940 auto inline reg_errcode_t 2941 __attribute__ ((always_inline)) 2942 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, 2943 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2944 { 2945 unsigned int ch; 2946 uint32_t start_collseq; 2947 uint32_t end_collseq; 2948 2949 /* Equivalence Classes and Character Classes can't be a range 2950 start/end. */ 2951 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2952 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2953 0)) 2954 return REG_ERANGE; 2955 2956 /* FIXME: Implement rational ranges here, too. */ 2957 start_collseq = lookup_collation_sequence_value (start_elem); 2958 end_collseq = lookup_collation_sequence_value (end_elem); 2959 /* Check start/end collation sequence values. */ 2960 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2961 return REG_ECOLLATE; 2962 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2963 return REG_ERANGE; 2964 2965 /* Got valid collation sequence values, add them as a new entry. 2966 However, if we have no collation elements, and the character set 2967 is single byte, the single byte character set that we 2968 build below suffices. */ 2969 if (nrules > 0 || dfa->mb_cur_max > 1) 2970 { 2971 /* Check the space of the arrays. */ 2972 if (BE (*range_alloc == mbcset->nranges, 0)) 2973 { 2974 /* There is not enough space, need realloc. */ 2975 uint32_t *new_array_start; 2976 uint32_t *new_array_end; 2977 Idx new_nranges; 2978 2979 /* +1 in case of mbcset->nranges is 0. */ 2980 new_nranges = 2 * mbcset->nranges + 1; 2981 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2982 new_nranges); 2983 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2984 new_nranges); 2985 2986 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2987 return REG_ESPACE; 2988 2989 mbcset->range_starts = new_array_start; 2990 mbcset->range_ends = new_array_end; 2991 *range_alloc = new_nranges; 2992 } 2993 2994 mbcset->range_starts[mbcset->nranges] = start_collseq; 2995 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2996 } 2997 2998 /* Build the table for single byte characters. */ 2999 for (ch = 0; ch < SBC_MAX; ch++) 3000 { 3001 uint32_t ch_collseq; 3002 /* 3003 if (MB_CUR_MAX == 1) 3004 */ 3005 if (nrules == 0) 3006 ch_collseq = collseqmb[ch]; 3007 else 3008 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 3009 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 3010 bitset_set (sbcset, ch); 3011 } 3012 return REG_NOERROR; 3013 } 3014 3015 /* Local function for parse_bracket_exp used in _LIBC environment. 3016 Build the collating element which is represented by NAME. 3017 The result are written to MBCSET and SBCSET. 3018 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 3019 pointer argument since we may update it. */ 3020 3021 auto inline reg_errcode_t 3022 __attribute__ ((always_inline)) 3023 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 3024 Idx *coll_sym_alloc, const unsigned char *name) 3025 { 3026 int32_t elem, idx; 3027 size_t name_len = strlen ((const char *) name); 3028 if (nrules != 0) 3029 { 3030 elem = seek_collating_symbol_entry (name, name_len); 3031 if (elem != -1) 3032 { 3033 /* We found the entry. */ 3034 idx = symb_table[2 * elem + 1]; 3035 /* Skip the name of collating element name. */ 3036 idx += 1 + extra[idx]; 3037 } 3038 else if (name_len == 1) 3039 { 3040 /* No valid character, treat it as a normal 3041 character. */ 3042 bitset_set (sbcset, name[0]); 3043 return REG_NOERROR; 3044 } 3045 else 3046 return REG_ECOLLATE; 3047 3048 /* Got valid collation sequence, add it as a new entry. */ 3049 /* Check the space of the arrays. */ 3050 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3051 { 3052 /* Not enough, realloc it. */ 3053 /* +1 in case of mbcset->ncoll_syms is 0. */ 3054 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3055 /* Use realloc since mbcset->coll_syms is NULL 3056 if *alloc == 0. */ 3057 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3058 new_coll_sym_alloc); 3059 if (BE (new_coll_syms == NULL, 0)) 3060 return REG_ESPACE; 3061 mbcset->coll_syms = new_coll_syms; 3062 *coll_sym_alloc = new_coll_sym_alloc; 3063 } 3064 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3065 return REG_NOERROR; 3066 } 3067 else 3068 { 3069 if (BE (name_len != 1, 0)) 3070 return REG_ECOLLATE; 3071 else 3072 { 3073 bitset_set (sbcset, name[0]); 3074 return REG_NOERROR; 3075 } 3076 } 3077 } 3078#endif 3079 3080 re_token_t br_token; 3081 re_bitset_ptr_t sbcset; 3082#ifdef RE_ENABLE_I18N 3083 re_charset_t *mbcset; 3084 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3085 Idx equiv_class_alloc = 0, char_class_alloc = 0; 3086#endif /* not RE_ENABLE_I18N */ 3087 bool non_match = false; 3088 bin_tree_t *work_tree; 3089 int token_len; 3090 bool first_round = true; 3091#ifdef _LIBC 3092 collseqmb = (const unsigned char *) 3093 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3094 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3095 if (nrules) 3096 { 3097 /* 3098 if (MB_CUR_MAX > 1) 3099 */ 3100 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3101 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3102 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3103 _NL_COLLATE_SYMB_TABLEMB); 3104 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3105 _NL_COLLATE_SYMB_EXTRAMB); 3106 } 3107#endif 3108 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3109#ifdef RE_ENABLE_I18N 3110 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3111#endif /* RE_ENABLE_I18N */ 3112#ifdef RE_ENABLE_I18N 3113 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3114#else 3115 if (BE (sbcset == NULL, 0)) 3116#endif /* RE_ENABLE_I18N */ 3117 { 3118 re_free (sbcset); 3119#ifdef RE_ENABLE_I18N 3120 re_free (mbcset); 3121#endif 3122 *err = REG_ESPACE; 3123 return NULL; 3124 } 3125 3126 token_len = peek_token_bracket (token, regexp, syntax); 3127 if (BE (token->type == END_OF_RE, 0)) 3128 { 3129 *err = REG_BADPAT; 3130 goto parse_bracket_exp_free_return; 3131 } 3132 if (token->type == OP_NON_MATCH_LIST) 3133 { 3134#ifdef RE_ENABLE_I18N 3135 mbcset->non_match = 1; 3136#endif /* not RE_ENABLE_I18N */ 3137 non_match = true; 3138 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3139 bitset_set (sbcset, '\n'); 3140 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3141 token_len = peek_token_bracket (token, regexp, syntax); 3142 if (BE (token->type == END_OF_RE, 0)) 3143 { 3144 *err = REG_BADPAT; 3145 goto parse_bracket_exp_free_return; 3146 } 3147 } 3148 3149 /* We treat the first ']' as a normal character. */ 3150 if (token->type == OP_CLOSE_BRACKET) 3151 token->type = CHARACTER; 3152 3153 while (1) 3154 { 3155 bracket_elem_t start_elem, end_elem; 3156 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3157 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3158 reg_errcode_t ret; 3159 int token_len2 = 0; 3160 bool is_range_exp = false; 3161 re_token_t token2; 3162 3163 start_elem.opr.name = start_name_buf; 3164 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3165 syntax, first_round); 3166 if (BE (ret != REG_NOERROR, 0)) 3167 { 3168 *err = ret; 3169 goto parse_bracket_exp_free_return; 3170 } 3171 first_round = false; 3172 3173 /* Get information about the next token. We need it in any case. */ 3174 token_len = peek_token_bracket (token, regexp, syntax); 3175 3176 /* Do not check for ranges if we know they are not allowed. */ 3177 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3178 { 3179 if (BE (token->type == END_OF_RE, 0)) 3180 { 3181 *err = REG_EBRACK; 3182 goto parse_bracket_exp_free_return; 3183 } 3184 if (token->type == OP_CHARSET_RANGE) 3185 { 3186 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3187 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3188 if (BE (token2.type == END_OF_RE, 0)) 3189 { 3190 *err = REG_EBRACK; 3191 goto parse_bracket_exp_free_return; 3192 } 3193 if (token2.type == OP_CLOSE_BRACKET) 3194 { 3195 /* We treat the last '-' as a normal character. */ 3196 re_string_skip_bytes (regexp, -token_len); 3197 token->type = CHARACTER; 3198 } 3199 else 3200 is_range_exp = true; 3201 } 3202 } 3203 3204 if (is_range_exp == true) 3205 { 3206 end_elem.opr.name = end_name_buf; 3207 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3208 dfa, syntax, true); 3209 if (BE (ret != REG_NOERROR, 0)) 3210 { 3211 *err = ret; 3212 goto parse_bracket_exp_free_return; 3213 } 3214 3215 token_len = peek_token_bracket (token, regexp, syntax); 3216 3217#ifdef _LIBC 3218 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3219 &start_elem, &end_elem); 3220#else 3221# ifdef RE_ENABLE_I18N 3222 *err = build_range_exp (syntax, sbcset, 3223 dfa->mb_cur_max > 1 ? mbcset : NULL, 3224 &range_alloc, &start_elem, &end_elem); 3225# else 3226 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); 3227# endif 3228#endif /* RE_ENABLE_I18N */ 3229 if (BE (*err != REG_NOERROR, 0)) 3230 goto parse_bracket_exp_free_return; 3231 } 3232 else 3233 { 3234 switch (start_elem.type) 3235 { 3236 case SB_CHAR: 3237 bitset_set (sbcset, start_elem.opr.ch); 3238 break; 3239#ifdef RE_ENABLE_I18N 3240 case MB_CHAR: 3241 /* Check whether the array has enough space. */ 3242 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3243 { 3244 wchar_t *new_mbchars; 3245 /* Not enough, realloc it. */ 3246 /* +1 in case of mbcset->nmbchars is 0. */ 3247 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3248 /* Use realloc since array is NULL if *alloc == 0. */ 3249 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3250 mbchar_alloc); 3251 if (BE (new_mbchars == NULL, 0)) 3252 goto parse_bracket_exp_espace; 3253 mbcset->mbchars = new_mbchars; 3254 } 3255 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3256 break; 3257#endif /* RE_ENABLE_I18N */ 3258 case EQUIV_CLASS: 3259 *err = build_equiv_class (sbcset, 3260#ifdef RE_ENABLE_I18N 3261 mbcset, &equiv_class_alloc, 3262#endif /* RE_ENABLE_I18N */ 3263 start_elem.opr.name); 3264 if (BE (*err != REG_NOERROR, 0)) 3265 goto parse_bracket_exp_free_return; 3266 break; 3267 case COLL_SYM: 3268 *err = build_collating_symbol (sbcset, 3269#ifdef RE_ENABLE_I18N 3270 mbcset, &coll_sym_alloc, 3271#endif /* RE_ENABLE_I18N */ 3272 start_elem.opr.name); 3273 if (BE (*err != REG_NOERROR, 0)) 3274 goto parse_bracket_exp_free_return; 3275 break; 3276 case CHAR_CLASS: 3277 *err = build_charclass (regexp->trans, sbcset, 3278#ifdef RE_ENABLE_I18N 3279 mbcset, &char_class_alloc, 3280#endif /* RE_ENABLE_I18N */ 3281 (const char *) start_elem.opr.name, 3282 syntax); 3283 if (BE (*err != REG_NOERROR, 0)) 3284 goto parse_bracket_exp_free_return; 3285 break; 3286 default: 3287 assert (0); 3288 break; 3289 } 3290 } 3291 if (BE (token->type == END_OF_RE, 0)) 3292 { 3293 *err = REG_EBRACK; 3294 goto parse_bracket_exp_free_return; 3295 } 3296 if (token->type == OP_CLOSE_BRACKET) 3297 break; 3298 } 3299 3300 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3301 3302 /* If it is non-matching list. */ 3303 if (non_match) 3304 bitset_not (sbcset); 3305 3306#ifdef RE_ENABLE_I18N 3307 /* Ensure only single byte characters are set. */ 3308 if (dfa->mb_cur_max > 1) 3309 bitset_mask (sbcset, dfa->sb_char); 3310 3311 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3312 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3313 || mbcset->non_match))) 3314 { 3315 bin_tree_t *mbc_tree; 3316 int sbc_idx; 3317 /* Build a tree for complex bracket. */ 3318 dfa->has_mb_node = 1; 3319 br_token.type = COMPLEX_BRACKET; 3320 br_token.opr.mbcset = mbcset; 3321 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3322 if (BE (mbc_tree == NULL, 0)) 3323 goto parse_bracket_exp_espace; 3324 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3325 if (sbcset[sbc_idx]) 3326 break; 3327 /* If there are no bits set in sbcset, there is no point 3328 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3329 if (sbc_idx < BITSET_WORDS) 3330 { 3331 /* Build a tree for simple bracket. */ 3332 br_token.type = SIMPLE_BRACKET; 3333 br_token.opr.sbcset = sbcset; 3334 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3335 if (BE (work_tree == NULL, 0)) 3336 goto parse_bracket_exp_espace; 3337 3338 /* Then join them by ALT node. */ 3339 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3340 if (BE (work_tree == NULL, 0)) 3341 goto parse_bracket_exp_espace; 3342 } 3343 else 3344 { 3345 re_free (sbcset); 3346 work_tree = mbc_tree; 3347 } 3348 } 3349 else 3350#endif /* not RE_ENABLE_I18N */ 3351 { 3352#ifdef RE_ENABLE_I18N 3353 free_charset (mbcset); 3354#endif 3355 /* Build a tree for simple bracket. */ 3356 br_token.type = SIMPLE_BRACKET; 3357 br_token.opr.sbcset = sbcset; 3358 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3359 if (BE (work_tree == NULL, 0)) 3360 goto parse_bracket_exp_espace; 3361 } 3362 return work_tree; 3363 3364 parse_bracket_exp_espace: 3365 *err = REG_ESPACE; 3366 parse_bracket_exp_free_return: 3367 re_free (sbcset); 3368#ifdef RE_ENABLE_I18N 3369 free_charset (mbcset); 3370#endif /* RE_ENABLE_I18N */ 3371 return NULL; 3372} 3373 3374/* Parse an element in the bracket expression. */ 3375 3376static reg_errcode_t 3377parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3378 re_token_t *token, int token_len, re_dfa_t *dfa, 3379 reg_syntax_t syntax, bool accept_hyphen) 3380{ 3381#ifdef RE_ENABLE_I18N 3382 int cur_char_size; 3383 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3384 if (cur_char_size > 1) 3385 { 3386 elem->type = MB_CHAR; 3387 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3388 re_string_skip_bytes (regexp, cur_char_size); 3389 return REG_NOERROR; 3390 } 3391#endif /* RE_ENABLE_I18N */ 3392 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3393 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3394 || token->type == OP_OPEN_EQUIV_CLASS) 3395 return parse_bracket_symbol (elem, regexp, token); 3396 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3397 { 3398 /* A '-' must only appear as anything but a range indicator before 3399 the closing bracket. Everything else is an error. */ 3400 re_token_t token2; 3401 (void) peek_token_bracket (&token2, regexp, syntax); 3402 if (token2.type != OP_CLOSE_BRACKET) 3403 /* The actual error value is not standardized since this whole 3404 case is undefined. But ERANGE makes good sense. */ 3405 return REG_ERANGE; 3406 } 3407 elem->type = SB_CHAR; 3408 elem->opr.ch = token->opr.c; 3409 return REG_NOERROR; 3410} 3411 3412/* Parse a bracket symbol in the bracket expression. Bracket symbols are 3413 such as [:<character_class>:], [.<collating_element>.], and 3414 [=<equivalent_class>=]. */ 3415 3416static reg_errcode_t 3417parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3418 re_token_t *token) 3419{ 3420 unsigned char ch, delim = token->opr.c; 3421 int i = 0; 3422 if (re_string_eoi(regexp)) 3423 return REG_EBRACK; 3424 for (;; ++i) 3425 { 3426 if (i >= BRACKET_NAME_BUF_SIZE) 3427 return REG_EBRACK; 3428 if (token->type == OP_OPEN_CHAR_CLASS) 3429 ch = re_string_fetch_byte_case (regexp); 3430 else 3431 ch = re_string_fetch_byte (regexp); 3432 if (re_string_eoi(regexp)) 3433 return REG_EBRACK; 3434 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3435 break; 3436 elem->opr.name[i] = ch; 3437 } 3438 re_string_skip_bytes (regexp, 1); 3439 elem->opr.name[i] = '\0'; 3440 switch (token->type) 3441 { 3442 case OP_OPEN_COLL_ELEM: 3443 elem->type = COLL_SYM; 3444 break; 3445 case OP_OPEN_EQUIV_CLASS: 3446 elem->type = EQUIV_CLASS; 3447 break; 3448 case OP_OPEN_CHAR_CLASS: 3449 elem->type = CHAR_CLASS; 3450 break; 3451 default: 3452 break; 3453 } 3454 return REG_NOERROR; 3455} 3456 3457 /* Helper function for parse_bracket_exp. 3458 Build the equivalence class which is represented by NAME. 3459 The result are written to MBCSET and SBCSET. 3460 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3461 is a pointer argument since we may update it. */ 3462 3463static reg_errcode_t 3464#ifdef RE_ENABLE_I18N 3465build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, 3466 Idx *equiv_class_alloc, const unsigned char *name) 3467#else /* not RE_ENABLE_I18N */ 3468build_equiv_class (bitset_t sbcset, const unsigned char *name) 3469#endif /* not RE_ENABLE_I18N */ 3470{ 3471#ifdef _LIBC 3472 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3473 if (nrules != 0) 3474 { 3475 const int32_t *table, *indirect; 3476 const unsigned char *weights, *extra, *cp; 3477 unsigned char char_buf[2]; 3478 int32_t idx1, idx2; 3479 unsigned int ch; 3480 size_t len; 3481 /* This #include defines a local function! */ 3482# include <locale/weight.h> 3483 /* Calculate the index for equivalence class. */ 3484 cp = name; 3485 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3486 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3487 _NL_COLLATE_WEIGHTMB); 3488 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3489 _NL_COLLATE_EXTRAMB); 3490 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3491 _NL_COLLATE_INDIRECTMB); 3492 idx1 = findidx (&cp, -1); 3493 if (BE (idx1 == 0 || *cp != '\0', 0)) 3494 /* This isn't a valid character. */ 3495 return REG_ECOLLATE; 3496 3497 /* Build single byte matching table for this equivalence class. */ 3498 len = weights[idx1 & 0xffffff]; 3499 for (ch = 0; ch < SBC_MAX; ++ch) 3500 { 3501 char_buf[0] = ch; 3502 cp = char_buf; 3503 idx2 = findidx (&cp, 1); 3504/* 3505 idx2 = table[ch]; 3506*/ 3507 if (idx2 == 0) 3508 /* This isn't a valid character. */ 3509 continue; 3510 /* Compare only if the length matches and the collation rule 3511 index is the same. */ 3512 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3513 { 3514 int cnt = 0; 3515 3516 while (cnt <= len && 3517 weights[(idx1 & 0xffffff) + 1 + cnt] 3518 == weights[(idx2 & 0xffffff) + 1 + cnt]) 3519 ++cnt; 3520 3521 if (cnt > len) 3522 bitset_set (sbcset, ch); 3523 } 3524 } 3525 /* Check whether the array has enough space. */ 3526 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3527 { 3528 /* Not enough, realloc it. */ 3529 /* +1 in case of mbcset->nequiv_classes is 0. */ 3530 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3531 /* Use realloc since the array is NULL if *alloc == 0. */ 3532 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3533 int32_t, 3534 new_equiv_class_alloc); 3535 if (BE (new_equiv_classes == NULL, 0)) 3536 return REG_ESPACE; 3537 mbcset->equiv_classes = new_equiv_classes; 3538 *equiv_class_alloc = new_equiv_class_alloc; 3539 } 3540 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3541 } 3542 else 3543#endif /* _LIBC */ 3544 { 3545 if (BE (strlen ((const char *) name) != 1, 0)) 3546 return REG_ECOLLATE; 3547 bitset_set (sbcset, *name); 3548 } 3549 return REG_NOERROR; 3550} 3551 3552 /* Helper function for parse_bracket_exp. 3553 Build the character class which is represented by NAME. 3554 The result are written to MBCSET and SBCSET. 3555 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3556 is a pointer argument since we may update it. */ 3557 3558static reg_errcode_t 3559#ifdef RE_ENABLE_I18N 3560build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3561 re_charset_t *mbcset, Idx *char_class_alloc, 3562 const char *class_name, reg_syntax_t syntax) 3563#else /* not RE_ENABLE_I18N */ 3564build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3565 const char *class_name, reg_syntax_t syntax) 3566#endif /* not RE_ENABLE_I18N */ 3567{ 3568 int i; 3569 const char *name = class_name; 3570 3571 /* In case of REG_ICASE "upper" and "lower" match the both of 3572 upper and lower cases. */ 3573 if ((syntax & RE_ICASE) 3574 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3575 name = "alpha"; 3576 3577#ifdef RE_ENABLE_I18N 3578 /* Check the space of the arrays. */ 3579 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3580 { 3581 /* Not enough, realloc it. */ 3582 /* +1 in case of mbcset->nchar_classes is 0. */ 3583 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3584 /* Use realloc since array is NULL if *alloc == 0. */ 3585 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3586 new_char_class_alloc); 3587 if (BE (new_char_classes == NULL, 0)) 3588 return REG_ESPACE; 3589 mbcset->char_classes = new_char_classes; 3590 *char_class_alloc = new_char_class_alloc; 3591 } 3592 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3593#endif /* RE_ENABLE_I18N */ 3594 3595#define BUILD_CHARCLASS_LOOP(ctype_func) \ 3596 do { \ 3597 if (BE (trans != NULL, 0)) \ 3598 { \ 3599 for (i = 0; i < SBC_MAX; ++i) \ 3600 if (ctype_func (i)) \ 3601 bitset_set (sbcset, trans[i]); \ 3602 } \ 3603 else \ 3604 { \ 3605 for (i = 0; i < SBC_MAX; ++i) \ 3606 if (ctype_func (i)) \ 3607 bitset_set (sbcset, i); \ 3608 } \ 3609 } while (0) 3610 3611 if (strcmp (name, "alnum") == 0) 3612 BUILD_CHARCLASS_LOOP (isalnum); 3613 else if (strcmp (name, "cntrl") == 0) 3614 BUILD_CHARCLASS_LOOP (iscntrl); 3615 else if (strcmp (name, "lower") == 0) 3616 BUILD_CHARCLASS_LOOP (islower); 3617 else if (strcmp (name, "space") == 0) 3618 BUILD_CHARCLASS_LOOP (isspace); 3619 else if (strcmp (name, "alpha") == 0) 3620 BUILD_CHARCLASS_LOOP (isalpha); 3621 else if (strcmp (name, "digit") == 0) 3622 BUILD_CHARCLASS_LOOP (isdigit); 3623 else if (strcmp (name, "print") == 0) 3624 BUILD_CHARCLASS_LOOP (isprint); 3625 else if (strcmp (name, "upper") == 0) 3626 BUILD_CHARCLASS_LOOP (isupper); 3627 else if (strcmp (name, "blank") == 0) 3628 BUILD_CHARCLASS_LOOP (isblank); 3629 else if (strcmp (name, "graph") == 0) 3630 BUILD_CHARCLASS_LOOP (isgraph); 3631 else if (strcmp (name, "punct") == 0) 3632 BUILD_CHARCLASS_LOOP (ispunct); 3633 else if (strcmp (name, "xdigit") == 0) 3634 BUILD_CHARCLASS_LOOP (isxdigit); 3635 else 3636 return REG_ECTYPE; 3637 3638 return REG_NOERROR; 3639} 3640 3641static bin_tree_t * 3642build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3643 const char *class_name, 3644 const char *extra, bool non_match, 3645 reg_errcode_t *err) 3646{ 3647 re_bitset_ptr_t sbcset; 3648#ifdef RE_ENABLE_I18N 3649 re_charset_t *mbcset; 3650 Idx alloc = 0; 3651#endif /* not RE_ENABLE_I18N */ 3652 reg_errcode_t ret; 3653 re_token_t br_token; 3654 bin_tree_t *tree; 3655 3656 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3657#ifdef RE_ENABLE_I18N 3658 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3659#endif /* RE_ENABLE_I18N */ 3660 3661#ifdef RE_ENABLE_I18N 3662 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3663#else /* not RE_ENABLE_I18N */ 3664 if (BE (sbcset == NULL, 0)) 3665#endif /* not RE_ENABLE_I18N */ 3666 { 3667 *err = REG_ESPACE; 3668 return NULL; 3669 } 3670 3671 if (non_match) 3672 { 3673#ifdef RE_ENABLE_I18N 3674 mbcset->non_match = 1; 3675#endif /* not RE_ENABLE_I18N */ 3676 } 3677 3678 /* We don't care the syntax in this case. */ 3679 ret = build_charclass (trans, sbcset, 3680#ifdef RE_ENABLE_I18N 3681 mbcset, &alloc, 3682#endif /* RE_ENABLE_I18N */ 3683 class_name, 0); 3684 3685 if (BE (ret != REG_NOERROR, 0)) 3686 { 3687 re_free (sbcset); 3688#ifdef RE_ENABLE_I18N 3689 free_charset (mbcset); 3690#endif /* RE_ENABLE_I18N */ 3691 *err = ret; 3692 return NULL; 3693 } 3694 /* \w match '_' also. */ 3695 for (; *extra; extra++) 3696 bitset_set (sbcset, *extra); 3697 3698 /* If it is non-matching list. */ 3699 if (non_match) 3700 bitset_not (sbcset); 3701 3702#ifdef RE_ENABLE_I18N 3703 /* Ensure only single byte characters are set. */ 3704 if (dfa->mb_cur_max > 1) 3705 bitset_mask (sbcset, dfa->sb_char); 3706#endif 3707 3708 /* Build a tree for simple bracket. */ 3709 br_token.type = SIMPLE_BRACKET; 3710 br_token.opr.sbcset = sbcset; 3711 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3712 if (BE (tree == NULL, 0)) 3713 goto build_word_op_espace; 3714 3715#ifdef RE_ENABLE_I18N 3716 if (dfa->mb_cur_max > 1) 3717 { 3718 bin_tree_t *mbc_tree; 3719 /* Build a tree for complex bracket. */ 3720 br_token.type = COMPLEX_BRACKET; 3721 br_token.opr.mbcset = mbcset; 3722 dfa->has_mb_node = 1; 3723 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3724 if (BE (mbc_tree == NULL, 0)) 3725 goto build_word_op_espace; 3726 /* Then join them by ALT node. */ 3727 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3728 if (BE (mbc_tree != NULL, 1)) 3729 return tree; 3730 } 3731 else 3732 { 3733 free_charset (mbcset); 3734 return tree; 3735 } 3736#else /* not RE_ENABLE_I18N */ 3737 return tree; 3738#endif /* not RE_ENABLE_I18N */ 3739 3740 build_word_op_espace: 3741 re_free (sbcset); 3742#ifdef RE_ENABLE_I18N 3743 free_charset (mbcset); 3744#endif /* RE_ENABLE_I18N */ 3745 *err = REG_ESPACE; 3746 return NULL; 3747} 3748 3749/* This is intended for the expressions like "a{1,3}". 3750 Fetch a number from 'input', and return the number. 3751 Return REG_MISSING if the number field is empty like "{,1}". 3752 Return RE_DUP_MAX + 1 if the number field is too large. 3753 Return REG_ERROR if an error occurred. */ 3754 3755static Idx 3756fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3757{ 3758 Idx num = REG_MISSING; 3759 unsigned char c; 3760 while (1) 3761 { 3762 fetch_token (token, input, syntax); 3763 c = token->opr.c; 3764 if (BE (token->type == END_OF_RE, 0)) 3765 return REG_ERROR; 3766 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3767 break; 3768 num = ((token->type != CHARACTER || c < '0' || '9' < c 3769 || num == REG_ERROR) 3770 ? REG_ERROR 3771 : num == REG_MISSING 3772 ? c - '0' 3773 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0')); 3774 } 3775 return num; 3776} 3777 3778#ifdef RE_ENABLE_I18N 3779static void 3780free_charset (re_charset_t *cset) 3781{ 3782 re_free (cset->mbchars); 3783# ifdef _LIBC 3784 re_free (cset->coll_syms); 3785 re_free (cset->equiv_classes); 3786 re_free (cset->range_starts); 3787 re_free (cset->range_ends); 3788# endif 3789 re_free (cset->char_classes); 3790 re_free (cset); 3791} 3792#endif /* RE_ENABLE_I18N */ 3793 3794/* Functions for binary tree operation. */ 3795 3796/* Create a tree node. */ 3797 3798static bin_tree_t * 3799create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3800 re_token_type_t type) 3801{ 3802 re_token_t t; 3803 t.type = type; 3804 return create_token_tree (dfa, left, right, &t); 3805} 3806 3807static bin_tree_t * 3808create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3809 const re_token_t *token) 3810{ 3811 bin_tree_t *tree; 3812 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3813 { 3814 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3815 3816 if (storage == NULL) 3817 return NULL; 3818 storage->next = dfa->str_tree_storage; 3819 dfa->str_tree_storage = storage; 3820 dfa->str_tree_storage_idx = 0; 3821 } 3822 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3823 3824 tree->parent = NULL; 3825 tree->left = left; 3826 tree->right = right; 3827 tree->token = *token; 3828 tree->token.duplicated = 0; 3829 tree->token.opt_subexp = 0; 3830 tree->first = NULL; 3831 tree->next = NULL; 3832 tree->node_idx = REG_MISSING; 3833 3834 if (left != NULL) 3835 left->parent = tree; 3836 if (right != NULL) 3837 right->parent = tree; 3838 return tree; 3839} 3840 3841/* Mark the tree SRC as an optional subexpression. 3842 To be called from preorder or postorder. */ 3843 3844static reg_errcode_t 3845mark_opt_subexp (void *extra, bin_tree_t *node) 3846{ 3847 Idx idx = (uintptr_t) extra; 3848 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3849 node->token.opt_subexp = 1; 3850 3851 return REG_NOERROR; 3852} 3853 3854/* Free the allocated memory inside NODE. */ 3855 3856static void 3857free_token (re_token_t *node) 3858{ 3859#ifdef RE_ENABLE_I18N 3860 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3861 free_charset (node->opr.mbcset); 3862 else 3863#endif /* RE_ENABLE_I18N */ 3864 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3865 re_free (node->opr.sbcset); 3866} 3867 3868/* Worker function for tree walking. Free the allocated memory inside NODE 3869 and its children. */ 3870 3871static reg_errcode_t 3872free_tree (void *extra, bin_tree_t *node) 3873{ 3874 free_token (&node->token); 3875 return REG_NOERROR; 3876} 3877 3878 3879/* Duplicate the node SRC, and return new node. This is a preorder 3880 visit similar to the one implemented by the generic visitor, but 3881 we need more infrastructure to maintain two parallel trees --- so, 3882 it's easier to duplicate. */ 3883 3884static bin_tree_t * 3885duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3886{ 3887 const bin_tree_t *node; 3888 bin_tree_t *dup_root; 3889 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3890 3891 for (node = root; ; ) 3892 { 3893 /* Create a new tree and link it back to the current parent. */ 3894 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3895 if (*p_new == NULL) 3896 return NULL; 3897 (*p_new)->parent = dup_node; 3898 (*p_new)->token.duplicated = 1; 3899 dup_node = *p_new; 3900 3901 /* Go to the left node, or up and to the right. */ 3902 if (node->left) 3903 { 3904 node = node->left; 3905 p_new = &dup_node->left; 3906 } 3907 else 3908 { 3909 const bin_tree_t *prev = NULL; 3910 while (node->right == prev || node->right == NULL) 3911 { 3912 prev = node; 3913 node = node->parent; 3914 dup_node = dup_node->parent; 3915 if (!node) 3916 return dup_root; 3917 } 3918 node = node->right; 3919 p_new = &dup_node->right; 3920 } 3921 } 3922} 3923