1/* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, write to the Free 18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19 02111-1307 USA. */ 20 21static reg_errcode_t re_compile_internal _RE_ARGS((regex_t *preg, const char * pattern, 22 int length, reg_syntax_t syntax)); 23static void re_compile_fastmap_iter _RE_ARGS((regex_t *bufp, 24 const re_dfastate_t *init_state, 25 char *fastmap)); 26static reg_errcode_t init_dfa _RE_ARGS((re_dfa_t *dfa, int pat_len)); 27static reg_errcode_t init_word_char _RE_ARGS((re_dfa_t *dfa)); 28#ifdef RE_ENABLE_I18N 29static void free_charset _RE_ARGS((re_charset_t *cset)); 30#endif /* RE_ENABLE_I18N */ 31static void free_workarea_compile _RE_ARGS((regex_t *preg)); 32static reg_errcode_t create_initial_state _RE_ARGS((re_dfa_t *dfa)); 33static reg_errcode_t analyze _RE_ARGS((re_dfa_t *dfa)); 34static reg_errcode_t analyze_tree _RE_ARGS((re_dfa_t *dfa, bin_tree_t *node)); 35static void calc_first _RE_ARGS((re_dfa_t *dfa, bin_tree_t *node)); 36static void calc_next _RE_ARGS((re_dfa_t *dfa, bin_tree_t *node)); 37static void calc_epsdest _RE_ARGS((re_dfa_t *dfa, bin_tree_t *node)); 38static reg_errcode_t duplicate_node_closure _RE_ARGS((re_dfa_t *dfa, int top_org_node, 39 int top_clone_node, int root_node, 40 unsigned int constraint)); 41static reg_errcode_t duplicate_node _RE_ARGS((int *new_idx, re_dfa_t *dfa, int org_idx, 42 unsigned int constraint)); 43static int search_duplicated_node _RE_ARGS((re_dfa_t *dfa, int org_node, 44 unsigned int constraint)); 45static reg_errcode_t calc_eclosure _RE_ARGS((re_dfa_t *dfa)); 46static reg_errcode_t calc_eclosure_iter _RE_ARGS((re_node_set *new_set, re_dfa_t *dfa, 47 int node, int root)); 48static void calc_inveclosure _RE_ARGS((re_dfa_t *dfa)); 49static int fetch_number _RE_ARGS((re_string_t *input, re_token_t *token, 50 reg_syntax_t syntax)); 51static re_token_t fetch_token _RE_ARGS((re_string_t *input, reg_syntax_t syntax)); 52static int peek_token _RE_ARGS((re_token_t *token, re_string_t *input, 53 reg_syntax_t syntax)); 54static int peek_token_bracket _RE_ARGS((re_token_t *token, re_string_t *input, 55 reg_syntax_t syntax)); 56static bin_tree_t *parse _RE_ARGS((re_string_t *regexp, regex_t *preg, 57 reg_syntax_t syntax, reg_errcode_t *err)); 58static bin_tree_t *parse_reg_exp _RE_ARGS((re_string_t *regexp, regex_t *preg, 59 re_token_t *token, reg_syntax_t syntax, 60 int nest, reg_errcode_t *err)); 61static bin_tree_t *parse_branch _RE_ARGS((re_string_t *regexp, regex_t *preg, 62 re_token_t *token, reg_syntax_t syntax, 63 int nest, reg_errcode_t *err)); 64static bin_tree_t *parse_expression _RE_ARGS((re_string_t *regexp, regex_t *preg, 65 re_token_t *token, reg_syntax_t syntax, 66 int nest, reg_errcode_t *err)); 67static bin_tree_t *parse_sub_exp _RE_ARGS((re_string_t *regexp, regex_t *preg, 68 re_token_t *token, reg_syntax_t syntax, 69 int nest, reg_errcode_t *err)); 70static bin_tree_t *parse_dup_op _RE_ARGS((bin_tree_t *dup_elem, re_string_t *regexp, 71 re_dfa_t *dfa, re_token_t *token, 72 reg_syntax_t syntax, reg_errcode_t *err)); 73static bin_tree_t *parse_bracket_exp _RE_ARGS((re_string_t *regexp, re_dfa_t *dfa, 74 re_token_t *token, reg_syntax_t syntax, 75 reg_errcode_t *err)); 76static reg_errcode_t parse_bracket_element _RE_ARGS((bracket_elem_t *elem, 77 re_string_t *regexp, 78 re_token_t *token, int token_len, 79 re_dfa_t *dfa, 80 reg_syntax_t syntax)); 81static reg_errcode_t parse_bracket_symbol _RE_ARGS((bracket_elem_t *elem, 82 re_string_t *regexp, 83 re_token_t *token)); 84#ifndef _LIBC 85# ifdef RE_ENABLE_I18N 86static reg_errcode_t build_range_exp _RE_ARGS((re_bitset_ptr_t sbcset, 87 re_charset_t *mbcset, int *range_alloc, 88 bracket_elem_t *start_elem, 89 bracket_elem_t *end_elem)); 90static reg_errcode_t build_collating_symbol _RE_ARGS((re_bitset_ptr_t sbcset, 91 re_charset_t *mbcset, 92 int *coll_sym_alloc, 93 const unsigned char *name)); 94# else /* not RE_ENABLE_I18N */ 95static reg_errcode_t build_range_exp _RE_ARGS((re_bitset_ptr_t sbcset, 96 bracket_elem_t *start_elem, 97 bracket_elem_t *end_elem)); 98static reg_errcode_t build_collating_symbol _RE_ARGS((re_bitset_ptr_t sbcset, 99 const unsigned char *name)); 100# endif /* not RE_ENABLE_I18N */ 101#endif /* not _LIBC */ 102#ifdef RE_ENABLE_I18N 103static reg_errcode_t build_equiv_class _RE_ARGS((re_bitset_ptr_t sbcset, 104 re_charset_t *mbcset, 105 int *equiv_class_alloc, 106 const unsigned char *name)); 107static reg_errcode_t build_charclass _RE_ARGS((RE_TRANSLATE_TYPE trans, 108 re_bitset_ptr_t sbcset, 109 re_charset_t *mbcset, 110 int *char_class_alloc, 111 const unsigned char *class_name, 112 reg_syntax_t syntax)); 113#else /* not RE_ENABLE_I18N */ 114static reg_errcode_t build_equiv_class _RE_ARGS((re_bitset_ptr_t sbcset, 115 const unsigned char *name)); 116static reg_errcode_t build_charclass _RE_ARGS((RE_TRANSLATE_TYPE trans, 117 re_bitset_ptr_t sbcset, 118 const unsigned char *class_name, 119 reg_syntax_t syntax)); 120#endif /* not RE_ENABLE_I18N */ 121static bin_tree_t *build_word_op _RE_ARGS((re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 122 int not, reg_errcode_t *err)); 123static void free_bin_tree _RE_ARGS((bin_tree_t *tree)); 124static bin_tree_t *create_tree _RE_ARGS((bin_tree_t *left, bin_tree_t *right, 125 re_token_type_t type, int index)); 126static bin_tree_t *duplicate_tree _RE_ARGS((const bin_tree_t *src, re_dfa_t *dfa)); 127 128/* This table gives an error message for each of the error codes listed 129 in regex.h. Obviously the order here has to be same as there. 130 POSIX doesn't require that we do anything for REG_NOERROR, 131 but why not be nice? */ 132 133const char __re_error_msgid[] attribute_hidden = 134 { 135#define REG_NOERROR_IDX 0 136 gettext_noop ("Success") /* REG_NOERROR */ 137 "\0" 138#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 139 gettext_noop ("No match") /* REG_NOMATCH */ 140 "\0" 141#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 142 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 143 "\0" 144#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 145 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 146 "\0" 147#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 148 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 149 "\0" 150#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 151 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 152 "\0" 153#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 154 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 155 "\0" 156#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 157 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 158 "\0" 159#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 160 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 161 "\0" 162#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 163 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 164 "\0" 165#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 166 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 167 "\0" 168#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 169 gettext_noop ("Invalid range end") /* REG_ERANGE */ 170 "\0" 171#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 172 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 173 "\0" 174#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 175 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 176 "\0" 177#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 178 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 179 "\0" 180#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 181 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 182 "\0" 183#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 184 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 185 }; 186 187const size_t __re_error_msgid_idx[] attribute_hidden = 188 { 189 REG_NOERROR_IDX, 190 REG_NOMATCH_IDX, 191 REG_BADPAT_IDX, 192 REG_ECOLLATE_IDX, 193 REG_ECTYPE_IDX, 194 REG_EESCAPE_IDX, 195 REG_ESUBREG_IDX, 196 REG_EBRACK_IDX, 197 REG_EPAREN_IDX, 198 REG_EBRACE_IDX, 199 REG_BADBR_IDX, 200 REG_ERANGE_IDX, 201 REG_ESPACE_IDX, 202 REG_BADRPT_IDX, 203 REG_EEND_IDX, 204 REG_ESIZE_IDX, 205 REG_ERPAREN_IDX 206 }; 207 208/* Entry points for GNU code. */ 209 210/* re_compile_pattern is the GNU regular expression compiler: it 211 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 212 Returns 0 if the pattern was valid, otherwise an error string. 213 214 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 215 are set in BUFP on entry. */ 216 217const char * 218re_compile_pattern (pattern, length, bufp) 219 const char *pattern; 220 size_t length; 221 struct re_pattern_buffer *bufp; 222{ 223 reg_errcode_t ret; 224 225 /* And GNU code determines whether or not to get register information 226 by passing null for the REGS argument to re_match, etc., not by 227 setting no_sub. */ 228 bufp->no_sub = 0; 229 230 /* Match anchors at newline. */ 231 bufp->newline_anchor = 1; 232 233 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 234 235 if (!ret) 236 return NULL; 237 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 238} 239#ifdef _LIBC 240weak_alias (__re_compile_pattern, re_compile_pattern) 241#endif 242 243/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 244 also be assigned to arbitrarily: each pattern buffer stores its own 245 syntax, so it can be changed between regex compilations. */ 246/* This has no initializer because initialized variables in Emacs 247 become read-only after dumping. */ 248reg_syntax_t re_syntax_options; 249 250 251/* Specify the precise syntax of regexps for compilation. This provides 252 for compatibility for various utilities which historically have 253 different, incompatible syntaxes. 254 255 The argument SYNTAX is a bit mask comprised of the various bits 256 defined in regex.h. We return the old syntax. */ 257 258reg_syntax_t 259re_set_syntax (syntax) 260 reg_syntax_t syntax; 261{ 262 reg_syntax_t ret = re_syntax_options; 263 264 re_syntax_options = syntax; 265#ifdef RE_ENABLE_I18N 266 re_mb_cur_max = MB_CUR_MAX; 267#endif 268 return ret; 269} 270#ifdef _LIBC 271weak_alias (__re_set_syntax, re_set_syntax) 272#endif 273 274int 275re_compile_fastmap (bufp) 276 struct re_pattern_buffer *bufp; 277{ 278 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 279 char *fastmap = bufp->fastmap; 280 281 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 282 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 283 if (dfa->init_state != dfa->init_state_word) 284 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 285 if (dfa->init_state != dfa->init_state_nl) 286 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 287 if (dfa->init_state != dfa->init_state_begbuf) 288 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 289 bufp->fastmap_accurate = 1; 290 return 0; 291} 292#ifdef _LIBC 293weak_alias (__re_compile_fastmap, re_compile_fastmap) 294#endif 295 296static inline void 297re_set_fastmap (char *fastmap, int icase, int ch) 298{ 299 fastmap[ch] = 1; 300 if (icase) 301 fastmap[tolower (ch)] = 1; 302} 303 304/* Helper function for re_compile_fastmap. 305 Compile fastmap for the initial_state INIT_STATE. */ 306 307static void 308re_compile_fastmap_iter (bufp, init_state, fastmap) 309 regex_t *bufp; 310 const re_dfastate_t *init_state; 311 char *fastmap; 312{ 313 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 314 int node_cnt; 315#ifdef RE_ENABLE_I18N 316 int icase = (re_mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 317#else 318 int icase = (bufp->syntax & RE_ICASE); 319#endif 320 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 321 { 322 int node = init_state->nodes.elems[node_cnt]; 323 re_token_type_t type = dfa->nodes[node].type; 324 325 if (type == CHARACTER) 326 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 327 else if (type == SIMPLE_BRACKET) 328 { 329 int i, j, ch; 330 for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 331 for (j = 0; j < UINT_BITS; ++j, ++ch) 332 if (dfa->nodes[node].opr.sbcset[i] & (1UL << j)) 333 re_set_fastmap (fastmap, icase, ch); 334 } 335#ifdef RE_ENABLE_I18N 336 else if (type == COMPLEX_BRACKET) 337 { 338 int i; 339 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 340 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes 341 || cset->nranges || cset->nchar_classes) 342 { 343# ifdef _LIBC 344 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) 345 { 346 /* In this case we want to catch the bytes which are 347 the first byte of any collation elements. 348 e.g. In da_DK, we want to catch 'a' since "aa" 349 is a valid collation element, and don't catch 350 'b' since 'b' is the only collation element 351 which starts from 'b'. */ 352 int j, ch; 353 const int32_t *table = (const int32_t *) 354 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 355 for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 356 for (j = 0; j < UINT_BITS; ++j, ++ch) 357 if (table[ch] < 0) 358 re_set_fastmap (fastmap, icase, ch); 359 } 360# else 361 if (re_mb_cur_max > 1) 362 for (i = 0; i < SBC_MAX; ++i) 363 if (__btowc (i) == WEOF) 364 re_set_fastmap (fastmap, icase, i); 365# endif /* not _LIBC */ 366 } 367 for (i = 0; i < cset->nmbchars; ++i) 368 { 369 char buf[256]; 370 mbstate_t state; 371 memset (&state, '\0', sizeof (state)); 372 __wcrtomb (buf, cset->mbchars[i], &state); 373 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 374 } 375 } 376#endif /* RE_ENABLE_I18N */ 377 else if (type == END_OF_RE || type == OP_PERIOD) 378 { 379 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 380 if (type == END_OF_RE) 381 bufp->can_be_null = 1; 382 return; 383 } 384 } 385} 386 387/* Entry point for POSIX code. */ 388/* regcomp takes a regular expression as a string and compiles it. 389 390 PREG is a regex_t *. We do not expect any fields to be initialized, 391 since POSIX says we shouldn't. Thus, we set 392 393 `buffer' to the compiled pattern; 394 `used' to the length of the compiled pattern; 395 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 396 REG_EXTENDED bit in CFLAGS is set; otherwise, to 397 RE_SYNTAX_POSIX_BASIC; 398 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 399 `fastmap' to an allocated space for the fastmap; 400 `fastmap_accurate' to zero; 401 `re_nsub' to the number of subexpressions in PATTERN. 402 403 PATTERN is the address of the pattern string. 404 405 CFLAGS is a series of bits which affect compilation. 406 407 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 408 use POSIX basic syntax. 409 410 If REG_NEWLINE is set, then . and [^...] don't match newline. 411 Also, regexec will try a match beginning after every newline. 412 413 If REG_ICASE is set, then we considers upper- and lowercase 414 versions of letters to be equivalent when matching. 415 416 If REG_NOSUB is set, then when PREG is passed to regexec, that 417 routine will report only success or failure, and nothing about the 418 registers. 419 420 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 421 the return codes and their meanings.) */ 422 423int 424regcomp (preg, pattern, cflags) 425 regex_t *__restrict preg; 426 const char *__restrict pattern; 427 int cflags; 428{ 429 reg_errcode_t ret; 430 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 431 : RE_SYNTAX_POSIX_BASIC); 432 433 preg->buffer = NULL; 434 preg->allocated = 0; 435 preg->used = 0; 436 437 /* Try to allocate space for the fastmap. */ 438 preg->fastmap = re_malloc (char, SBC_MAX); 439 if (BE (preg->fastmap == NULL, 0)) 440 return REG_ESPACE; 441 442 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 443 444 /* If REG_NEWLINE is set, newlines are treated differently. */ 445 if (cflags & REG_NEWLINE) 446 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 447 syntax &= ~RE_DOT_NEWLINE; 448 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 449 /* It also changes the matching behavior. */ 450 preg->newline_anchor = 1; 451 } 452 else 453 preg->newline_anchor = 0; 454 preg->no_sub = !!(cflags & REG_NOSUB); 455 preg->translate = NULL; 456 457 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 458 459 /* POSIX doesn't distinguish between an unmatched open-group and an 460 unmatched close-group: both are REG_EPAREN. */ 461 if (ret == REG_ERPAREN) 462 ret = REG_EPAREN; 463 464 /* We have already checked preg->fastmap != NULL. */ 465 if (BE (ret == REG_NOERROR, 1)) 466 /* Compute the fastmap now, since regexec cannot modify the pattern 467 buffer. This function nevers fails in this implementation. */ 468 (void) re_compile_fastmap (preg); 469 else 470 { 471 /* Some error occurred while compiling the expression. */ 472 re_free (preg->fastmap); 473 preg->fastmap = NULL; 474 } 475 476 return (int) ret; 477} 478#ifdef _LIBC 479weak_alias (__regcomp, regcomp) 480#endif 481 482/* Returns a message corresponding to an error code, ERRCODE, returned 483 from either regcomp or regexec. We don't use PREG here. */ 484 485size_t 486regerror (errcode, preg, errbuf, errbuf_size) 487 int errcode; 488 const regex_t *preg; 489 char *errbuf; 490 size_t errbuf_size; 491{ 492 const char *msg; 493 size_t msg_size; 494 495 if (BE (errcode < 0 496 || errcode >= (int) (sizeof (__re_error_msgid_idx) 497 / sizeof (__re_error_msgid_idx[0])), 0)) 498 /* Only error codes returned by the rest of the code should be passed 499 to this routine. If we are given anything else, or if other regex 500 code generates an invalid error code, then the program has a bug. 501 Dump core so we can fix it. */ 502 abort (); 503 504 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 505 506 msg_size = strlen (msg) + 1; /* Includes the null. */ 507 508 if (BE (errbuf_size != 0, 1)) 509 { 510 if (BE (msg_size > errbuf_size, 0)) 511 { 512 memcpy (errbuf, msg, errbuf_size - 1); 513 errbuf[errbuf_size - 1] = 0; 514 } 515 else 516 memcpy (errbuf, msg, msg_size); 517 } 518 519 return msg_size; 520} 521#ifdef _LIBC 522weak_alias (__regerror, regerror) 523#endif 524 525 526static void 527free_dfa_content (re_dfa_t *dfa) 528{ 529 int i, j; 530 531 re_free (dfa->subexps); 532 533 for (i = 0; i < dfa->nodes_len; ++i) 534 { 535 re_token_t *node = dfa->nodes + i; 536#ifdef RE_ENABLE_I18N 537 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 538 free_charset (node->opr.mbcset); 539 else 540#endif /* RE_ENABLE_I18N */ 541 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 542 re_free (node->opr.sbcset); 543 } 544 re_free (dfa->nexts); 545 for (i = 0; i < dfa->nodes_len; ++i) 546 { 547 if (dfa->eclosures != NULL) 548 re_node_set_free (dfa->eclosures + i); 549 if (dfa->inveclosures != NULL) 550 re_node_set_free (dfa->inveclosures + i); 551 if (dfa->edests != NULL) 552 re_node_set_free (dfa->edests + i); 553 } 554 re_free (dfa->edests); 555 re_free (dfa->eclosures); 556 re_free (dfa->inveclosures); 557 re_free (dfa->nodes); 558 559 for (i = 0; i <= dfa->state_hash_mask; ++i) 560 { 561 struct re_state_table_entry *entry = dfa->state_table + i; 562 for (j = 0; j < entry->num; ++j) 563 { 564 re_dfastate_t *state = entry->array[j]; 565 free_state (state); 566 } 567 re_free (entry->array); 568 } 569 re_free (dfa->state_table); 570 571 if (dfa->word_char != NULL) 572 re_free (dfa->word_char); 573#ifdef DEBUG 574 re_free (dfa->re_str); 575#endif 576 577 re_free (dfa); 578} 579 580 581/* Free dynamically allocated space used by PREG. */ 582 583void 584regfree (preg) 585 regex_t *preg; 586{ 587 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 588 if (BE (dfa != NULL, 1)) 589 free_dfa_content (dfa); 590 591 re_free (preg->fastmap); 592} 593#ifdef _LIBC 594weak_alias (__regfree, regfree) 595#endif 596 597/* Entry points compatible with 4.2 BSD regex library. We don't define 598 them unless specifically requested. */ 599 600#if defined _REGEX_RE_COMP || defined _LIBC 601 602/* BSD has one and only one pattern buffer. */ 603static struct re_pattern_buffer re_comp_buf; 604 605char * 606# ifdef _LIBC 607/* Make these definitions weak in libc, so POSIX programs can redefine 608 these names if they don't use our functions, and still use 609 regcomp/regexec above without link errors. */ 610weak_function 611# endif 612re_comp (s) 613 const char *s; 614{ 615 reg_errcode_t ret; 616 char *fastmap; 617 618 if (!s) 619 { 620 if (!re_comp_buf.buffer) 621 return gettext ("No previous regular expression"); 622 return 0; 623 } 624 625 if (re_comp_buf.buffer) 626 { 627 fastmap = re_comp_buf.fastmap; 628 re_comp_buf.fastmap = NULL; 629 __regfree (&re_comp_buf); 630 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 631 re_comp_buf.fastmap = fastmap; 632 } 633 634 if (re_comp_buf.fastmap == NULL) 635 { 636 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 637 if (re_comp_buf.fastmap == NULL) 638 return (char *) gettext (__re_error_msgid 639 + __re_error_msgid_idx[(int) REG_ESPACE]); 640 } 641 642 /* Since `re_exec' always passes NULL for the `regs' argument, we 643 don't need to initialize the pattern buffer fields which affect it. */ 644 645 /* Match anchors at newlines. */ 646 re_comp_buf.newline_anchor = 1; 647 648 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 649 650 if (!ret) 651 return NULL; 652 653 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 654 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 655} 656 657#ifdef _LIBC 658libc_freeres_fn (free_mem) 659{ 660 __regfree (&re_comp_buf); 661} 662#endif 663 664#endif /* _REGEX_RE_COMP */ 665 666/* Internal entry point. 667 Compile the regular expression PATTERN, whose length is LENGTH. 668 SYNTAX indicate regular expression's syntax. */ 669 670static reg_errcode_t 671re_compile_internal (preg, pattern, length, syntax) 672 regex_t *preg; 673 const char * pattern; 674 int length; 675 reg_syntax_t syntax; 676{ 677 reg_errcode_t err = REG_NOERROR; 678 re_dfa_t *dfa; 679 re_string_t regexp; 680 681 /* Initialize the pattern buffer. */ 682 preg->fastmap_accurate = 0; 683 preg->syntax = syntax; 684 preg->not_bol = preg->not_eol = 0; 685 preg->used = 0; 686 preg->re_nsub = 0; 687 preg->can_be_null = 0; 688 preg->regs_allocated = REGS_UNALLOCATED; 689 690 /* Initialize the dfa. */ 691 dfa = (re_dfa_t *) preg->buffer; 692 if (preg->allocated < sizeof (re_dfa_t)) 693 { 694 /* If zero allocated, but buffer is non-null, try to realloc 695 enough space. This loses if buffer's address is bogus, but 696 that is the user's responsibility. If ->buffer is NULL this 697 is a simple allocation. */ 698 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 699 if (dfa == NULL) 700 return REG_ESPACE; 701 preg->allocated = sizeof (re_dfa_t); 702 } 703 preg->buffer = (unsigned char *) dfa; 704 preg->used = sizeof (re_dfa_t); 705 706 err = init_dfa (dfa, length); 707 if (BE (err != REG_NOERROR, 0)) 708 { 709 re_free (dfa); 710 preg->buffer = NULL; 711 preg->allocated = 0; 712 return err; 713 } 714#ifdef DEBUG 715 dfa->re_str = re_malloc (char, length + 1); 716 strncpy (dfa->re_str, pattern, length + 1); 717#endif 718 719 err = re_string_construct (®exp, pattern, length, preg->translate, 720 syntax & RE_ICASE); 721 if (BE (err != REG_NOERROR, 0)) 722 { 723 re_free (dfa); 724 preg->buffer = NULL; 725 preg->allocated = 0; 726 return err; 727 } 728 729 /* Parse the regular expression, and build a structure tree. */ 730 preg->re_nsub = 0; 731 dfa->str_tree = parse (®exp, preg, syntax, &err); 732 if (BE (dfa->str_tree == NULL, 0)) 733 goto re_compile_internal_free_return; 734 735 /* Analyze the tree and collect information which is necessary to 736 create the dfa. */ 737 err = analyze (dfa); 738 if (BE (err != REG_NOERROR, 0)) 739 goto re_compile_internal_free_return; 740 741 /* Then create the initial state of the dfa. */ 742 err = create_initial_state (dfa); 743 744 /* Release work areas. */ 745 free_workarea_compile (preg); 746 re_string_destruct (®exp); 747 748 if (BE (err != REG_NOERROR, 0)) 749 { 750 re_compile_internal_free_return: 751 free_dfa_content (dfa); 752 preg->buffer = NULL; 753 preg->allocated = 0; 754 } 755 756 return err; 757} 758 759/* Initialize DFA. We use the length of the regular expression PAT_LEN 760 as the initial length of some arrays. */ 761 762static reg_errcode_t 763init_dfa (dfa, pat_len) 764 re_dfa_t *dfa; 765 int pat_len; 766{ 767 int table_size; 768 769 memset (dfa, '\0', sizeof (re_dfa_t)); 770 771 dfa->nodes_alloc = pat_len + 1; 772 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 773 774 dfa->states_alloc = pat_len + 1; 775 776 /* table_size = 2 ^ ceil(log pat_len) */ 777 for (table_size = 1; table_size > 0; table_size <<= 1) 778 if (table_size > pat_len) 779 break; 780 781 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 782 dfa->state_hash_mask = table_size - 1; 783 784 dfa->subexps_alloc = 1; 785 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc); 786 dfa->word_char = NULL; 787 788 if (BE (dfa->nodes == NULL || dfa->state_table == NULL 789 || dfa->subexps == NULL, 0)) 790 { 791 /* We don't bother to free anything which was allocated. Very 792 soon the process will go down anyway. */ 793 dfa->subexps = NULL; 794 dfa->state_table = NULL; 795 dfa->nodes = NULL; 796 return REG_ESPACE; 797 } 798 return REG_NOERROR; 799} 800 801/* Initialize WORD_CHAR table, which indicate which character is 802 "word". In this case "word" means that it is the word construction 803 character used by some operators like "\<", "\>", etc. */ 804 805static reg_errcode_t 806init_word_char (dfa) 807 re_dfa_t *dfa; 808{ 809 int i, j, ch; 810 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1); 811 if (BE (dfa->word_char == NULL, 0)) 812 return REG_ESPACE; 813 for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 814 for (j = 0; j < UINT_BITS; ++j, ++ch) 815 if (isalnum (ch) || ch == '_') 816 dfa->word_char[i] |= 1UL << j; 817 return REG_NOERROR; 818} 819 820/* Free the work area which are only used while compiling. */ 821 822static void 823free_workarea_compile (preg) 824 regex_t *preg; 825{ 826 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 827 free_bin_tree (dfa->str_tree); 828 dfa->str_tree = NULL; 829 re_free (dfa->org_indices); 830 dfa->org_indices = NULL; 831} 832 833/* Create initial states for all contexts. */ 834 835static reg_errcode_t 836create_initial_state (dfa) 837 re_dfa_t *dfa; 838{ 839 int first, i; 840 reg_errcode_t err; 841 re_node_set init_nodes; 842 843 /* Initial states have the epsilon closure of the node which is 844 the first node of the regular expression. */ 845 first = dfa->str_tree->first; 846 dfa->init_node = first; 847 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 848 if (BE (err != REG_NOERROR, 0)) 849 return err; 850 851 /* The back-references which are in initial states can epsilon transit, 852 since in this case all of the subexpressions can be null. 853 Then we add epsilon closures of the nodes which are the next nodes of 854 the back-references. */ 855 if (dfa->nbackref > 0) 856 for (i = 0; i < init_nodes.nelem; ++i) 857 { 858 int node_idx = init_nodes.elems[i]; 859 re_token_type_t type = dfa->nodes[node_idx].type; 860 861 int clexp_idx; 862 if (type != OP_BACK_REF) 863 continue; 864 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 865 { 866 re_token_t *clexp_node; 867 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 868 if (clexp_node->type == OP_CLOSE_SUBEXP 869 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx) 870 break; 871 } 872 if (clexp_idx == init_nodes.nelem) 873 continue; 874 875 if (type == OP_BACK_REF) 876 { 877 int dest_idx = dfa->edests[node_idx].elems[0]; 878 if (!re_node_set_contains (&init_nodes, dest_idx)) 879 { 880 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 881 i = 0; 882 } 883 } 884 } 885 886 /* It must be the first time to invoke acquire_state. */ 887 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 888 /* We don't check ERR here, since the initial state must not be NULL. */ 889 if (BE (dfa->init_state == NULL, 0)) 890 return err; 891 if (dfa->init_state->has_constraint) 892 { 893 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 894 CONTEXT_WORD); 895 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 896 CONTEXT_NEWLINE); 897 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 898 &init_nodes, 899 CONTEXT_NEWLINE 900 | CONTEXT_BEGBUF); 901 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 902 || dfa->init_state_begbuf == NULL, 0)) 903 return err; 904 } 905 else 906 dfa->init_state_word = dfa->init_state_nl 907 = dfa->init_state_begbuf = dfa->init_state; 908 909 re_node_set_free (&init_nodes); 910 return REG_NOERROR; 911} 912 913/* Analyze the structure tree, and calculate "first", "next", "edest", 914 "eclosure", and "inveclosure". */ 915 916static reg_errcode_t 917analyze (dfa) 918 re_dfa_t *dfa; 919{ 920 int i; 921 reg_errcode_t ret; 922 923 /* Allocate arrays. */ 924 dfa->nexts = re_malloc (int, dfa->nodes_alloc); 925 dfa->org_indices = re_malloc (int, dfa->nodes_alloc); 926 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 927 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 928 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc); 929 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 930 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0)) 931 return REG_ESPACE; 932 /* Initialize them. */ 933 for (i = 0; i < dfa->nodes_len; ++i) 934 { 935 dfa->nexts[i] = -1; 936 re_node_set_init_empty (dfa->edests + i); 937 re_node_set_init_empty (dfa->eclosures + i); 938 re_node_set_init_empty (dfa->inveclosures + i); 939 } 940 941 ret = analyze_tree (dfa, dfa->str_tree); 942 if (BE (ret == REG_NOERROR, 1)) 943 { 944 ret = calc_eclosure (dfa); 945 if (ret == REG_NOERROR) 946 calc_inveclosure (dfa); 947 } 948 return ret; 949} 950 951/* Helper functions for analyze. 952 This function calculate "first", "next", and "edest" for the subtree 953 whose root is NODE. */ 954 955static reg_errcode_t 956analyze_tree (dfa, node) 957 re_dfa_t *dfa; 958 bin_tree_t *node; 959{ 960 reg_errcode_t ret; 961 if (node->first == -1) 962 calc_first (dfa, node); 963 if (node->next == -1) 964 calc_next (dfa, node); 965 if (node->eclosure.nelem == 0) 966 calc_epsdest (dfa, node); 967 /* Calculate "first" etc. for the left child. */ 968 if (node->left != NULL) 969 { 970 ret = analyze_tree (dfa, node->left); 971 if (BE (ret != REG_NOERROR, 0)) 972 return ret; 973 } 974 /* Calculate "first" etc. for the right child. */ 975 if (node->right != NULL) 976 { 977 ret = analyze_tree (dfa, node->right); 978 if (BE (ret != REG_NOERROR, 0)) 979 return ret; 980 } 981 return REG_NOERROR; 982} 983 984/* Calculate "first" for the node NODE. */ 985static void 986calc_first (dfa, node) 987 re_dfa_t *dfa; 988 bin_tree_t *node; 989{ 990 int idx, type; 991 idx = node->node_idx; 992 type = (node->type == 0) ? dfa->nodes[idx].type : node->type; 993 994 switch (type) 995 { 996#ifdef DEBUG 997 case OP_OPEN_BRACKET: 998 case OP_CLOSE_BRACKET: 999 case OP_OPEN_DUP_NUM: 1000 case OP_CLOSE_DUP_NUM: 1001 case OP_NON_MATCH_LIST: 1002 case OP_OPEN_COLL_ELEM: 1003 case OP_CLOSE_COLL_ELEM: 1004 case OP_OPEN_EQUIV_CLASS: 1005 case OP_CLOSE_EQUIV_CLASS: 1006 case OP_OPEN_CHAR_CLASS: 1007 case OP_CLOSE_CHAR_CLASS: 1008 /* These must not be appeared here. */ 1009 assert (0); 1010#endif 1011 case END_OF_RE: 1012 case CHARACTER: 1013 case OP_PERIOD: 1014 case OP_DUP_ASTERISK: 1015 case OP_DUP_QUESTION: 1016#ifdef RE_ENABLE_I18N 1017 case COMPLEX_BRACKET: 1018#endif /* RE_ENABLE_I18N */ 1019 case SIMPLE_BRACKET: 1020 case OP_BACK_REF: 1021 case ANCHOR: 1022 case OP_OPEN_SUBEXP: 1023 case OP_CLOSE_SUBEXP: 1024 node->first = idx; 1025 break; 1026 case OP_DUP_PLUS: 1027#ifdef DEBUG 1028 assert (node->left != NULL); 1029#endif 1030 if (node->left->first == -1) 1031 calc_first (dfa, node->left); 1032 node->first = node->left->first; 1033 break; 1034 case OP_ALT: 1035 node->first = idx; 1036 break; 1037 /* else fall through */ 1038 default: 1039#ifdef DEBUG 1040 assert (node->left != NULL); 1041#endif 1042 if (node->left->first == -1) 1043 calc_first (dfa, node->left); 1044 node->first = node->left->first; 1045 break; 1046 } 1047} 1048 1049/* Calculate "next" for the node NODE. */ 1050 1051static void 1052calc_next (dfa, node) 1053 re_dfa_t *dfa; 1054 bin_tree_t *node; 1055{ 1056 int idx, type; 1057 bin_tree_t *parent = node->parent; 1058 if (parent == NULL) 1059 { 1060 node->next = -1; 1061 idx = node->node_idx; 1062 if (node->type == 0) 1063 dfa->nexts[idx] = node->next; 1064 return; 1065 } 1066 1067 idx = parent->node_idx; 1068 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type; 1069 1070 switch (type) 1071 { 1072 case OP_DUP_ASTERISK: 1073 case OP_DUP_PLUS: 1074 node->next = idx; 1075 break; 1076 case CONCAT: 1077 if (parent->left == node) 1078 { 1079 if (parent->right->first == -1) 1080 calc_first (dfa, parent->right); 1081 node->next = parent->right->first; 1082 break; 1083 } 1084 /* else fall through */ 1085 default: 1086 if (parent->next == -1) 1087 calc_next (dfa, parent); 1088 node->next = parent->next; 1089 break; 1090 } 1091 idx = node->node_idx; 1092 if (node->type == 0) 1093 dfa->nexts[idx] = node->next; 1094} 1095 1096/* Calculate "edest" for the node NODE. */ 1097 1098static void 1099calc_epsdest (dfa, node) 1100 re_dfa_t *dfa; 1101 bin_tree_t *node; 1102{ 1103 int idx; 1104 idx = node->node_idx; 1105 if (node->type == 0) 1106 { 1107 if (dfa->nodes[idx].type == OP_DUP_ASTERISK 1108 || dfa->nodes[idx].type == OP_DUP_PLUS 1109 || dfa->nodes[idx].type == OP_DUP_QUESTION) 1110 { 1111 if (node->left->first == -1) 1112 calc_first (dfa, node->left); 1113 if (node->next == -1) 1114 calc_next (dfa, node); 1115 re_node_set_init_2 (dfa->edests + idx, node->left->first, 1116 node->next); 1117 } 1118 else if (dfa->nodes[idx].type == OP_ALT) 1119 { 1120 int left, right; 1121 if (node->left != NULL) 1122 { 1123 if (node->left->first == -1) 1124 calc_first (dfa, node->left); 1125 left = node->left->first; 1126 } 1127 else 1128 { 1129 if (node->next == -1) 1130 calc_next (dfa, node); 1131 left = node->next; 1132 } 1133 if (node->right != NULL) 1134 { 1135 if (node->right->first == -1) 1136 calc_first (dfa, node->right); 1137 right = node->right->first; 1138 } 1139 else 1140 { 1141 if (node->next == -1) 1142 calc_next (dfa, node); 1143 right = node->next; 1144 } 1145 re_node_set_init_2 (dfa->edests + idx, left, right); 1146 } 1147 else if (dfa->nodes[idx].type == ANCHOR 1148 || dfa->nodes[idx].type == OP_OPEN_SUBEXP 1149 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP 1150 || dfa->nodes[idx].type == OP_BACK_REF) 1151 re_node_set_init_1 (dfa->edests + idx, node->next); 1152 } 1153} 1154 1155/* Duplicate the epsilon closure of the node ROOT_NODE. 1156 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1157 to their own constraint. */ 1158 1159static reg_errcode_t 1160duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, 1161 init_constraint) 1162 re_dfa_t *dfa; 1163 int top_org_node, top_clone_node, root_node; 1164 unsigned int init_constraint; 1165{ 1166 reg_errcode_t err; 1167 int org_node, clone_node, ret; 1168 unsigned int constraint = init_constraint; 1169 for (org_node = top_org_node, clone_node = top_clone_node;;) 1170 { 1171 int org_dest, clone_dest; 1172 if (dfa->nodes[org_node].type == OP_BACK_REF) 1173 { 1174 /* If the back reference epsilon-transit, its destination must 1175 also have the constraint. Then duplicate the epsilon closure 1176 of the destination of the back reference, and store it in 1177 edests of the back reference. */ 1178 org_dest = dfa->nexts[org_node]; 1179 re_node_set_empty (dfa->edests + clone_node); 1180 err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1181 if (BE (err != REG_NOERROR, 0)) 1182 return err; 1183 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1184 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1185 if (BE (ret < 0, 0)) 1186 return REG_ESPACE; 1187 } 1188 else if (dfa->edests[org_node].nelem == 0) 1189 { 1190 /* In case of the node can't epsilon-transit, don't duplicate the 1191 destination and store the original destination as the 1192 destination of the node. */ 1193 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1194 break; 1195 } 1196 else if (dfa->edests[org_node].nelem == 1) 1197 { 1198 /* In case of the node can epsilon-transit, and it has only one 1199 destination. */ 1200 org_dest = dfa->edests[org_node].elems[0]; 1201 re_node_set_empty (dfa->edests + clone_node); 1202 if (dfa->nodes[org_node].type == ANCHOR) 1203 { 1204 /* In case of the node has another constraint, append it. */ 1205 if (org_node == root_node && clone_node != org_node) 1206 { 1207 /* ...but if the node is root_node itself, it means the 1208 epsilon closure have a loop, then tie it to the 1209 destination of the root_node. */ 1210 ret = re_node_set_insert (dfa->edests + clone_node, 1211 org_dest); 1212 if (BE (ret < 0, 0)) 1213 return REG_ESPACE; 1214 break; 1215 } 1216 constraint |= dfa->nodes[org_node].opr.ctx_type; 1217 } 1218 err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1219 if (BE (err != REG_NOERROR, 0)) 1220 return err; 1221 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1222 if (BE (ret < 0, 0)) 1223 return REG_ESPACE; 1224 } 1225 else /* dfa->edests[org_node].nelem == 2 */ 1226 { 1227 /* In case of the node can epsilon-transit, and it has two 1228 destinations. E.g. '|', '*', '+', '?'. */ 1229 org_dest = dfa->edests[org_node].elems[0]; 1230 re_node_set_empty (dfa->edests + clone_node); 1231 /* Search for a duplicated node which satisfies the constraint. */ 1232 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1233 if (clone_dest == -1) 1234 { 1235 /* There are no such a duplicated node, create a new one. */ 1236 err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1237 if (BE (err != REG_NOERROR, 0)) 1238 return err; 1239 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1240 if (BE (ret < 0, 0)) 1241 return REG_ESPACE; 1242 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1243 root_node, constraint); 1244 if (BE (err != REG_NOERROR, 0)) 1245 return err; 1246 } 1247 else 1248 { 1249 /* There are a duplicated node which satisfy the constraint, 1250 use it to avoid infinite loop. */ 1251 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1252 if (BE (ret < 0, 0)) 1253 return REG_ESPACE; 1254 } 1255 1256 org_dest = dfa->edests[org_node].elems[1]; 1257 err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1258 if (BE (err != REG_NOERROR, 0)) 1259 return err; 1260 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1261 if (BE (ret < 0, 0)) 1262 return REG_ESPACE; 1263 } 1264 org_node = org_dest; 1265 clone_node = clone_dest; 1266 } 1267 return REG_NOERROR; 1268} 1269 1270/* Search for a node which is duplicated from the node ORG_NODE, and 1271 satisfies the constraint CONSTRAINT. */ 1272 1273static int 1274search_duplicated_node (dfa, org_node, constraint) 1275 re_dfa_t *dfa; 1276 int org_node; 1277 unsigned int constraint; 1278{ 1279 int idx; 1280 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1281 { 1282 if (org_node == dfa->org_indices[idx] 1283 && constraint == dfa->nodes[idx].constraint) 1284 return idx; /* Found. */ 1285 } 1286 return -1; /* Not found. */ 1287} 1288 1289/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1290 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded, 1291 otherwise return the error code. */ 1292 1293static reg_errcode_t 1294duplicate_node (new_idx, dfa, org_idx, constraint) 1295 re_dfa_t *dfa; 1296 int *new_idx, org_idx; 1297 unsigned int constraint; 1298{ 1299 re_token_t dup; 1300 int dup_idx; 1301 1302 dup = dfa->nodes[org_idx]; 1303 dup_idx = re_dfa_add_node (dfa, dup, 1); 1304 if (BE (dup_idx == -1, 0)) 1305 return REG_ESPACE; 1306 dfa->nodes[dup_idx].constraint = constraint; 1307 if (dfa->nodes[org_idx].type == ANCHOR) 1308 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; 1309 dfa->nodes[dup_idx].duplicated = 1; 1310 re_node_set_init_empty (dfa->edests + dup_idx); 1311 re_node_set_init_empty (dfa->eclosures + dup_idx); 1312 re_node_set_init_empty (dfa->inveclosures + dup_idx); 1313 1314 /* Store the index of the original node. */ 1315 dfa->org_indices[dup_idx] = org_idx; 1316 *new_idx = dup_idx; 1317 return REG_NOERROR; 1318} 1319 1320static void 1321calc_inveclosure (dfa) 1322 re_dfa_t *dfa; 1323{ 1324 int src, idx, dest; 1325 for (src = 0; src < dfa->nodes_len; ++src) 1326 { 1327 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1328 { 1329 dest = dfa->eclosures[src].elems[idx]; 1330 re_node_set_insert (dfa->inveclosures + dest, src); 1331 } 1332 } 1333} 1334 1335/* Calculate "eclosure" for all the node in DFA. */ 1336 1337static reg_errcode_t 1338calc_eclosure (dfa) 1339 re_dfa_t *dfa; 1340{ 1341 int node_idx, incomplete; 1342#ifdef DEBUG 1343 assert (dfa->nodes_len > 0); 1344#endif 1345 incomplete = 0; 1346 /* For each nodes, calculate epsilon closure. */ 1347 for (node_idx = 0; ; ++node_idx) 1348 { 1349 reg_errcode_t err; 1350 re_node_set eclosure_elem; 1351 if (node_idx == dfa->nodes_len) 1352 { 1353 if (!incomplete) 1354 break; 1355 incomplete = 0; 1356 node_idx = 0; 1357 } 1358 1359#ifdef DEBUG 1360 assert (dfa->eclosures[node_idx].nelem != -1); 1361#endif 1362 /* If we have already calculated, skip it. */ 1363 if (dfa->eclosures[node_idx].nelem != 0) 1364 continue; 1365 /* Calculate epsilon closure of `node_idx'. */ 1366 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1); 1367 if (BE (err != REG_NOERROR, 0)) 1368 return err; 1369 1370 if (dfa->eclosures[node_idx].nelem == 0) 1371 { 1372 incomplete = 1; 1373 re_node_set_free (&eclosure_elem); 1374 } 1375 } 1376 return REG_NOERROR; 1377} 1378 1379/* Calculate epsilon closure of NODE. */ 1380 1381static reg_errcode_t 1382calc_eclosure_iter (new_set, dfa, node, root) 1383 re_node_set *new_set; 1384 re_dfa_t *dfa; 1385 int node, root; 1386{ 1387 reg_errcode_t err; 1388 unsigned int constraint; 1389 int i, incomplete; 1390 re_node_set eclosure; 1391 incomplete = 0; 1392 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1393 if (BE (err != REG_NOERROR, 0)) 1394 return err; 1395 1396 /* This indicates that we are calculating this node now. 1397 We reference this value to avoid infinite loop. */ 1398 dfa->eclosures[node].nelem = -1; 1399 1400 constraint = ((dfa->nodes[node].type == ANCHOR) 1401 ? dfa->nodes[node].opr.ctx_type : 0); 1402 /* If the current node has constraints, duplicate all nodes. 1403 Since they must inherit the constraints. */ 1404 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1405 { 1406 int org_node, cur_node; 1407 org_node = cur_node = node; 1408 err = duplicate_node_closure (dfa, node, node, node, constraint); 1409 if (BE (err != REG_NOERROR, 0)) 1410 return err; 1411 } 1412 1413 /* Expand each epsilon destination nodes. */ 1414 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1415 for (i = 0; i < dfa->edests[node].nelem; ++i) 1416 { 1417 re_node_set eclosure_elem; 1418 int edest = dfa->edests[node].elems[i]; 1419 /* If calculating the epsilon closure of `edest' is in progress, 1420 return intermediate result. */ 1421 if (dfa->eclosures[edest].nelem == -1) 1422 { 1423 incomplete = 1; 1424 continue; 1425 } 1426 /* If we haven't calculated the epsilon closure of `edest' yet, 1427 calculate now. Otherwise use calculated epsilon closure. */ 1428 if (dfa->eclosures[edest].nelem == 0) 1429 { 1430 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0); 1431 if (BE (err != REG_NOERROR, 0)) 1432 return err; 1433 } 1434 else 1435 eclosure_elem = dfa->eclosures[edest]; 1436 /* Merge the epsilon closure of `edest'. */ 1437 re_node_set_merge (&eclosure, &eclosure_elem); 1438 /* If the epsilon closure of `edest' is incomplete, 1439 the epsilon closure of this node is also incomplete. */ 1440 if (dfa->eclosures[edest].nelem == 0) 1441 { 1442 incomplete = 1; 1443 re_node_set_free (&eclosure_elem); 1444 } 1445 } 1446 1447 /* Epsilon closures include itself. */ 1448 re_node_set_insert (&eclosure, node); 1449 if (incomplete && !root) 1450 dfa->eclosures[node].nelem = 0; 1451 else 1452 dfa->eclosures[node] = eclosure; 1453 *new_set = eclosure; 1454 return REG_NOERROR; 1455} 1456 1457/* Functions for token which are used in the parser. */ 1458 1459/* Fetch a token from INPUT. 1460 We must not use this function inside bracket expressions. */ 1461 1462static re_token_t 1463fetch_token (input, syntax) 1464 re_string_t *input; 1465 reg_syntax_t syntax; 1466{ 1467 re_token_t token; 1468 int consumed_byte; 1469 consumed_byte = peek_token (&token, input, syntax); 1470 re_string_skip_bytes (input, consumed_byte); 1471 return token; 1472} 1473 1474/* Peek a token from INPUT, and return the length of the token. 1475 We must not use this function inside bracket expressions. */ 1476 1477static int 1478peek_token (token, input, syntax) 1479 re_token_t *token; 1480 re_string_t *input; 1481 reg_syntax_t syntax; 1482{ 1483 unsigned char c; 1484 1485 if (re_string_eoi (input)) 1486 { 1487 token->type = END_OF_RE; 1488 return 0; 1489 } 1490 1491 c = re_string_peek_byte (input, 0); 1492 token->opr.c = c; 1493 1494#ifdef RE_ENABLE_I18N 1495 token->mb_partial = 0; 1496 if (re_mb_cur_max > 1 && 1497 !re_string_first_byte (input, re_string_cur_idx (input))) 1498 { 1499 token->type = CHARACTER; 1500 token->mb_partial = 1; 1501 return 1; 1502 } 1503#endif 1504 if (c == '\\') 1505 { 1506 unsigned char c2; 1507 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1508 { 1509 token->type = BACK_SLASH; 1510 return 1; 1511 } 1512 1513 c2 = re_string_peek_byte_case (input, 1); 1514 token->opr.c = c2; 1515 token->type = CHARACTER; 1516 switch (c2) 1517 { 1518 case '|': 1519 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1520 token->type = OP_ALT; 1521 break; 1522 case '1': case '2': case '3': case '4': case '5': 1523 case '6': case '7': case '8': case '9': 1524 if (!(syntax & RE_NO_BK_REFS)) 1525 { 1526 token->type = OP_BACK_REF; 1527 token->opr.idx = c2 - '0'; 1528 } 1529 break; 1530 case '<': 1531 if (!(syntax & RE_NO_GNU_OPS)) 1532 { 1533 token->type = ANCHOR; 1534 token->opr.idx = WORD_FIRST; 1535 } 1536 break; 1537 case '>': 1538 if (!(syntax & RE_NO_GNU_OPS)) 1539 { 1540 token->type = ANCHOR; 1541 token->opr.idx = WORD_LAST; 1542 } 1543 break; 1544 case 'b': 1545 if (!(syntax & RE_NO_GNU_OPS)) 1546 { 1547 token->type = ANCHOR; 1548 token->opr.idx = WORD_DELIM; 1549 } 1550 break; 1551 case 'B': 1552 if (!(syntax & RE_NO_GNU_OPS)) 1553 { 1554 token->type = ANCHOR; 1555 token->opr.idx = INSIDE_WORD; 1556 } 1557 break; 1558 case 'w': 1559 if (!(syntax & RE_NO_GNU_OPS)) 1560 token->type = OP_WORD; 1561 break; 1562 case 'W': 1563 if (!(syntax & RE_NO_GNU_OPS)) 1564 token->type = OP_NOTWORD; 1565 break; 1566 case '`': 1567 if (!(syntax & RE_NO_GNU_OPS)) 1568 { 1569 token->type = ANCHOR; 1570 token->opr.idx = BUF_FIRST; 1571 } 1572 break; 1573 case '\'': 1574 if (!(syntax & RE_NO_GNU_OPS)) 1575 { 1576 token->type = ANCHOR; 1577 token->opr.idx = BUF_LAST; 1578 } 1579 break; 1580 case '(': 1581 if (!(syntax & RE_NO_BK_PARENS)) 1582 token->type = OP_OPEN_SUBEXP; 1583 break; 1584 case ')': 1585 if (!(syntax & RE_NO_BK_PARENS)) 1586 token->type = OP_CLOSE_SUBEXP; 1587 break; 1588 case '+': 1589 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1590 token->type = OP_DUP_PLUS; 1591 break; 1592 case '?': 1593 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1594 token->type = OP_DUP_QUESTION; 1595 break; 1596 case '{': 1597 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1598 token->type = OP_OPEN_DUP_NUM; 1599 break; 1600 case '}': 1601 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1602 token->type = OP_CLOSE_DUP_NUM; 1603 break; 1604 default: 1605 break; 1606 } 1607 return 2; 1608 } 1609 1610 token->type = CHARACTER; 1611 switch (c) 1612 { 1613 case '\n': 1614 if (syntax & RE_NEWLINE_ALT) 1615 token->type = OP_ALT; 1616 break; 1617 case '|': 1618 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1619 token->type = OP_ALT; 1620 break; 1621 case '*': 1622 token->type = OP_DUP_ASTERISK; 1623 break; 1624 case '+': 1625 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1626 token->type = OP_DUP_PLUS; 1627 break; 1628 case '?': 1629 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1630 token->type = OP_DUP_QUESTION; 1631 break; 1632 case '{': 1633 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1634 token->type = OP_OPEN_DUP_NUM; 1635 break; 1636 case '}': 1637 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1638 token->type = OP_CLOSE_DUP_NUM; 1639 break; 1640 case '(': 1641 if (syntax & RE_NO_BK_PARENS) 1642 token->type = OP_OPEN_SUBEXP; 1643 break; 1644 case ')': 1645 if (syntax & RE_NO_BK_PARENS) 1646 token->type = OP_CLOSE_SUBEXP; 1647 break; 1648 case '[': 1649 token->type = OP_OPEN_BRACKET; 1650 break; 1651 case '.': 1652 token->type = OP_PERIOD; 1653 break; 1654 case '^': 1655 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1656 re_string_cur_idx (input) != 0) 1657 { 1658 char prev = re_string_peek_byte (input, -1); 1659 if (prev != '|' && prev != '(' && 1660 (!(syntax & RE_NEWLINE_ALT) || prev != '\n')) 1661 break; 1662 } 1663 token->type = ANCHOR; 1664 token->opr.idx = LINE_FIRST; 1665 break; 1666 case '$': 1667 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1668 re_string_cur_idx (input) + 1 != re_string_length (input)) 1669 { 1670 re_token_t next; 1671 re_string_skip_bytes (input, 1); 1672 peek_token (&next, input, syntax); 1673 re_string_skip_bytes (input, -1); 1674 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1675 break; 1676 } 1677 token->type = ANCHOR; 1678 token->opr.idx = LINE_LAST; 1679 break; 1680 default: 1681 break; 1682 } 1683 return 1; 1684} 1685 1686/* Peek a token from INPUT, and return the length of the token. 1687 We must not use this function out of bracket expressions. */ 1688 1689static int 1690peek_token_bracket (token, input, syntax) 1691 re_token_t *token; 1692 re_string_t *input; 1693 reg_syntax_t syntax; 1694{ 1695 unsigned char c; 1696 if (re_string_eoi (input)) 1697 { 1698 token->type = END_OF_RE; 1699 return 0; 1700 } 1701 c = re_string_peek_byte (input, 0); 1702 token->opr.c = c; 1703 1704#ifdef RE_ENABLE_I18N 1705 if (re_mb_cur_max > 1 && 1706 !re_string_first_byte (input, re_string_cur_idx (input))) 1707 { 1708 token->type = CHARACTER; 1709 return 1; 1710 } 1711#endif /* RE_ENABLE_I18N */ 1712 1713 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1714 { 1715 /* In this case, '\' escape a character. */ 1716 unsigned char c2; 1717 re_string_skip_bytes (input, 1); 1718 c2 = re_string_peek_byte (input, 0); 1719 token->opr.c = c2; 1720 token->type = CHARACTER; 1721 return 1; 1722 } 1723 if (c == '[') /* '[' is a special char in a bracket exps. */ 1724 { 1725 unsigned char c2; 1726 int token_len; 1727 c2 = re_string_peek_byte (input, 1); 1728 token->opr.c = c2; 1729 token_len = 2; 1730 switch (c2) 1731 { 1732 case '.': 1733 token->type = OP_OPEN_COLL_ELEM; 1734 break; 1735 case '=': 1736 token->type = OP_OPEN_EQUIV_CLASS; 1737 break; 1738 case ':': 1739 if (syntax & RE_CHAR_CLASSES) 1740 { 1741 token->type = OP_OPEN_CHAR_CLASS; 1742 break; 1743 } 1744 /* else fall through. */ 1745 default: 1746 token->type = CHARACTER; 1747 token->opr.c = c; 1748 token_len = 1; 1749 break; 1750 } 1751 return token_len; 1752 } 1753 switch (c) 1754 { 1755 case '-': 1756 token->type = OP_CHARSET_RANGE; 1757 break; 1758 case ']': 1759 token->type = OP_CLOSE_BRACKET; 1760 break; 1761 case '^': 1762 token->type = OP_NON_MATCH_LIST; 1763 break; 1764 default: 1765 token->type = CHARACTER; 1766 } 1767 return 1; 1768} 1769 1770/* Functions for parser. */ 1771 1772/* Entry point of the parser. 1773 Parse the regular expression REGEXP and return the structure tree. 1774 If an error is occured, ERR is set by error code, and return NULL. 1775 This function build the following tree, from regular expression <reg_exp>: 1776 CAT 1777 / \ 1778 / \ 1779 <reg_exp> EOR 1780 1781 CAT means concatenation. 1782 EOR means end of regular expression. */ 1783 1784static bin_tree_t * 1785parse (regexp, preg, syntax, err) 1786 re_string_t *regexp; 1787 regex_t *preg; 1788 reg_syntax_t syntax; 1789 reg_errcode_t *err; 1790{ 1791 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1792 bin_tree_t *tree, *eor, *root; 1793 re_token_t current_token; 1794 int new_idx; 1795 current_token = fetch_token (regexp, syntax); 1796 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 1797 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 1798 return NULL; 1799 new_idx = re_dfa_add_node (dfa, current_token, 0); 1800 eor = create_tree (NULL, NULL, 0, new_idx); 1801 if (tree != NULL) 1802 root = create_tree (tree, eor, CONCAT, 0); 1803 else 1804 root = eor; 1805 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0)) 1806 { 1807 *err = REG_ESPACE; 1808 return NULL; 1809 } 1810 return root; 1811} 1812 1813/* This function build the following tree, from regular expression 1814 <branch1>|<branch2>: 1815 ALT 1816 / \ 1817 / \ 1818 <branch1> <branch2> 1819 1820 ALT means alternative, which represents the operator `|'. */ 1821 1822static bin_tree_t * 1823parse_reg_exp (regexp, preg, token, syntax, nest, err) 1824 re_string_t *regexp; 1825 regex_t *preg; 1826 re_token_t *token; 1827 reg_syntax_t syntax; 1828 int nest; 1829 reg_errcode_t *err; 1830{ 1831 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1832 bin_tree_t *tree, *branch = NULL; 1833 int new_idx; 1834 tree = parse_branch (regexp, preg, token, syntax, nest, err); 1835 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 1836 return NULL; 1837 1838 while (token->type == OP_ALT) 1839 { 1840 re_token_t alt_token; 1841 alt_token = *token; 1842 new_idx = re_dfa_add_node (dfa, alt_token, 0); 1843 *token = fetch_token (regexp, syntax); 1844 if (token->type != OP_ALT && token->type != END_OF_RE 1845 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 1846 { 1847 branch = parse_branch (regexp, preg, token, syntax, nest, err); 1848 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 1849 { 1850 free_bin_tree (tree); 1851 return NULL; 1852 } 1853 } 1854 else 1855 branch = NULL; 1856 tree = create_tree (tree, branch, 0, new_idx); 1857 if (BE (new_idx == -1 || tree == NULL, 0)) 1858 { 1859 *err = REG_ESPACE; 1860 return NULL; 1861 } 1862 dfa->has_plural_match = 1; 1863 } 1864 return tree; 1865} 1866 1867/* This function build the following tree, from regular expression 1868 <exp1><exp2>: 1869 CAT 1870 / \ 1871 / \ 1872 <exp1> <exp2> 1873 1874 CAT means concatenation. */ 1875 1876static bin_tree_t * 1877parse_branch (regexp, preg, token, syntax, nest, err) 1878 re_string_t *regexp; 1879 regex_t *preg; 1880 re_token_t *token; 1881 reg_syntax_t syntax; 1882 int nest; 1883 reg_errcode_t *err; 1884{ 1885 bin_tree_t *tree, *exp; 1886 tree = parse_expression (regexp, preg, token, syntax, nest, err); 1887 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 1888 return NULL; 1889 1890 while (token->type != OP_ALT && token->type != END_OF_RE 1891 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 1892 { 1893 exp = parse_expression (regexp, preg, token, syntax, nest, err); 1894 if (BE (*err != REG_NOERROR && exp == NULL, 0)) 1895 { 1896 free_bin_tree (tree); 1897 return NULL; 1898 } 1899 if (tree != NULL && exp != NULL) 1900 { 1901 tree = create_tree (tree, exp, CONCAT, 0); 1902 if (tree == NULL) 1903 { 1904 *err = REG_ESPACE; 1905 return NULL; 1906 } 1907 } 1908 else if (tree == NULL) 1909 tree = exp; 1910 /* Otherwise exp == NULL, we don't need to create new tree. */ 1911 } 1912 return tree; 1913} 1914 1915/* This function build the following tree, from regular expression a*: 1916 * 1917 | 1918 a 1919*/ 1920 1921static bin_tree_t * 1922parse_expression (regexp, preg, token, syntax, nest, err) 1923 re_string_t *regexp; 1924 regex_t *preg; 1925 re_token_t *token; 1926 reg_syntax_t syntax; 1927 int nest; 1928 reg_errcode_t *err; 1929{ 1930 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1931 bin_tree_t *tree; 1932 int new_idx; 1933 switch (token->type) 1934 { 1935 case CHARACTER: 1936 new_idx = re_dfa_add_node (dfa, *token, 0); 1937 tree = create_tree (NULL, NULL, 0, new_idx); 1938 if (BE (new_idx == -1 || tree == NULL, 0)) 1939 { 1940 *err = REG_ESPACE; 1941 return NULL; 1942 } 1943#ifdef RE_ENABLE_I18N 1944 if (re_mb_cur_max > 1) 1945 { 1946 while (!re_string_eoi (regexp) 1947 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 1948 { 1949 bin_tree_t *mbc_remain; 1950 *token = fetch_token (regexp, syntax); 1951 new_idx = re_dfa_add_node (dfa, *token, 0); 1952 mbc_remain = create_tree (NULL, NULL, 0, new_idx); 1953 tree = create_tree (tree, mbc_remain, CONCAT, 0); 1954 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0)) 1955 { 1956 *err = REG_ESPACE; 1957 return NULL; 1958 } 1959 } 1960 } 1961#endif 1962 break; 1963 case OP_OPEN_SUBEXP: 1964 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 1965 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 1966 return NULL; 1967 break; 1968 case OP_OPEN_BRACKET: 1969 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 1970 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 1971 return NULL; 1972 break; 1973 case OP_BACK_REF: 1974 if (BE (preg->re_nsub < token->opr.idx 1975 || dfa->subexps[token->opr.idx - 1].end == -1, 0)) 1976 { 1977 *err = REG_ESUBREG; 1978 return NULL; 1979 } 1980 dfa->used_bkref_map |= 1UL << (token->opr.idx - 1); 1981 new_idx = re_dfa_add_node (dfa, *token, 0); 1982 tree = create_tree (NULL, NULL, 0, new_idx); 1983 if (BE (new_idx == -1 || tree == NULL, 0)) 1984 { 1985 *err = REG_ESPACE; 1986 return NULL; 1987 } 1988 ++dfa->nbackref; 1989 dfa->has_mb_node = 1; 1990 break; 1991 case OP_DUP_ASTERISK: 1992 case OP_DUP_PLUS: 1993 case OP_DUP_QUESTION: 1994 case OP_OPEN_DUP_NUM: 1995 if (syntax & RE_CONTEXT_INVALID_OPS) 1996 { 1997 *err = REG_BADRPT; 1998 return NULL; 1999 } 2000 else if (syntax & RE_CONTEXT_INDEP_OPS) 2001 { 2002 *token = fetch_token (regexp, syntax); 2003 return parse_expression (regexp, preg, token, syntax, nest, err); 2004 } 2005 /* else fall through */ 2006 case OP_CLOSE_SUBEXP: 2007 if ((token->type == OP_CLOSE_SUBEXP) && 2008 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2009 { 2010 *err = REG_ERPAREN; 2011 return NULL; 2012 } 2013 /* else fall through */ 2014 case OP_CLOSE_DUP_NUM: 2015 /* We treat it as a normal character. */ 2016 2017 /* Then we can these characters as normal characters. */ 2018 token->type = CHARACTER; 2019 new_idx = re_dfa_add_node (dfa, *token, 0); 2020 tree = create_tree (NULL, NULL, 0, new_idx); 2021 if (BE (new_idx == -1 || tree == NULL, 0)) 2022 { 2023 *err = REG_ESPACE; 2024 return NULL; 2025 } 2026 break; 2027 case ANCHOR: 2028 if (dfa->word_char == NULL) 2029 { 2030 *err = init_word_char (dfa); 2031 if (BE (*err != REG_NOERROR, 0)) 2032 return NULL; 2033 } 2034 if (token->opr.ctx_type == WORD_DELIM) 2035 { 2036 bin_tree_t *tree_first, *tree_last; 2037 int idx_first, idx_last; 2038 token->opr.ctx_type = WORD_FIRST; 2039 idx_first = re_dfa_add_node (dfa, *token, 0); 2040 tree_first = create_tree (NULL, NULL, 0, idx_first); 2041 token->opr.ctx_type = WORD_LAST; 2042 idx_last = re_dfa_add_node (dfa, *token, 0); 2043 tree_last = create_tree (NULL, NULL, 0, idx_last); 2044 token->type = OP_ALT; 2045 new_idx = re_dfa_add_node (dfa, *token, 0); 2046 tree = create_tree (tree_first, tree_last, 0, new_idx); 2047 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1 2048 || tree_first == NULL || tree_last == NULL 2049 || tree == NULL, 0)) 2050 { 2051 *err = REG_ESPACE; 2052 return NULL; 2053 } 2054 } 2055 else 2056 { 2057 new_idx = re_dfa_add_node (dfa, *token, 0); 2058 tree = create_tree (NULL, NULL, 0, new_idx); 2059 if (BE (new_idx == -1 || tree == NULL, 0)) 2060 { 2061 *err = REG_ESPACE; 2062 return NULL; 2063 } 2064 } 2065 /* We must return here, since ANCHORs can't be followed 2066 by repetition operators. 2067 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2068 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2069 *token = fetch_token (regexp, syntax); 2070 return tree; 2071 case OP_PERIOD: 2072 new_idx = re_dfa_add_node (dfa, *token, 0); 2073 tree = create_tree (NULL, NULL, 0, new_idx); 2074 if (BE (new_idx == -1 || tree == NULL, 0)) 2075 { 2076 *err = REG_ESPACE; 2077 return NULL; 2078 } 2079#ifdef RE_ENABLE_I18N 2080 if (re_mb_cur_max > 1) 2081 dfa->has_mb_node = 1; 2082#endif 2083 break; 2084 case OP_WORD: 2085 tree = build_word_op (dfa, regexp->trans, 0, err); 2086 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2087 return NULL; 2088 break; 2089 case OP_NOTWORD: 2090 tree = build_word_op (dfa, regexp->trans, 1, err); 2091 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2092 return NULL; 2093 break; 2094 case OP_ALT: 2095 case END_OF_RE: 2096 return NULL; 2097 case BACK_SLASH: 2098 *err = REG_EESCAPE; 2099 return NULL; 2100 default: 2101 /* Must not happen? */ 2102#ifdef DEBUG 2103 assert (0); 2104#endif 2105 return NULL; 2106 } 2107 *token = fetch_token (regexp, syntax); 2108 2109 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2110 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2111 { 2112 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2113 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2114 return NULL; 2115 dfa->has_plural_match = 1; 2116 } 2117 2118 return tree; 2119} 2120 2121/* This function build the following tree, from regular expression 2122 (<reg_exp>): 2123 SUBEXP 2124 | 2125 <reg_exp> 2126*/ 2127 2128static bin_tree_t * 2129parse_sub_exp (regexp, preg, token, syntax, nest, err) 2130 re_string_t *regexp; 2131 regex_t *preg; 2132 re_token_t *token; 2133 reg_syntax_t syntax; 2134 int nest; 2135 reg_errcode_t *err; 2136{ 2137 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2138 bin_tree_t *tree, *left_par, *right_par; 2139 size_t cur_nsub; 2140 int new_idx; 2141 cur_nsub = preg->re_nsub++; 2142 if (dfa->subexps_alloc < preg->re_nsub) 2143 { 2144 re_subexp_t *new_array; 2145 dfa->subexps_alloc *= 2; 2146 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc); 2147 if (BE (new_array == NULL, 0)) 2148 { 2149 dfa->subexps_alloc /= 2; 2150 *err = REG_ESPACE; 2151 return NULL; 2152 } 2153 dfa->subexps = new_array; 2154 } 2155 dfa->subexps[cur_nsub].start = dfa->nodes_len; 2156 dfa->subexps[cur_nsub].end = -1; 2157 2158 new_idx = re_dfa_add_node (dfa, *token, 0); 2159 left_par = create_tree (NULL, NULL, 0, new_idx); 2160 if (BE (new_idx == -1 || left_par == NULL, 0)) 2161 { 2162 *err = REG_ESPACE; 2163 return NULL; 2164 } 2165 dfa->nodes[new_idx].opr.idx = cur_nsub; 2166 *token = fetch_token (regexp, syntax); 2167 2168 /* The subexpression may be a null string. */ 2169 if (token->type == OP_CLOSE_SUBEXP) 2170 tree = NULL; 2171 else 2172 { 2173 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2174 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2175 return NULL; 2176 } 2177 if (BE (token->type != OP_CLOSE_SUBEXP, 0)) 2178 { 2179 free_bin_tree (tree); 2180 *err = REG_BADPAT; 2181 return NULL; 2182 } 2183 new_idx = re_dfa_add_node (dfa, *token, 0); 2184 dfa->subexps[cur_nsub].end = dfa->nodes_len; 2185 right_par = create_tree (NULL, NULL, 0, new_idx); 2186 tree = ((tree == NULL) ? right_par 2187 : create_tree (tree, right_par, CONCAT, 0)); 2188 tree = create_tree (left_par, tree, CONCAT, 0); 2189 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0)) 2190 { 2191 *err = REG_ESPACE; 2192 return NULL; 2193 } 2194 dfa->nodes[new_idx].opr.idx = cur_nsub; 2195 2196 return tree; 2197} 2198 2199/* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2200 2201static bin_tree_t * 2202parse_dup_op (dup_elem, regexp, dfa, token, syntax, err) 2203 bin_tree_t *dup_elem; 2204 re_string_t *regexp; 2205 re_dfa_t *dfa; 2206 re_token_t *token; 2207 reg_syntax_t syntax; 2208 reg_errcode_t *err; 2209{ 2210 re_token_t dup_token; 2211 bin_tree_t *tree = dup_elem, *work_tree; 2212 int new_idx, start_idx = re_string_cur_idx (regexp); 2213 re_token_t start_token; 2214 start_token = *token; 2215 if (token->type == OP_OPEN_DUP_NUM) 2216 { 2217 int i; 2218 int end = 0; 2219 int start = fetch_number (regexp, token, syntax); 2220 bin_tree_t *elem; 2221 if (start == -1) 2222 { 2223 if (token->type == CHARACTER && token->opr.c == ',') 2224 start = 0; /* We treat "{,m}" as "{0,m}". */ 2225 else 2226 { 2227 *err = REG_BADBR; /* <re>{} is invalid. */ 2228 return NULL; 2229 } 2230 } 2231 if (BE (start != -2, 1)) 2232 { 2233 /* We treat "{n}" as "{n,n}". */ 2234 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2235 : ((token->type == CHARACTER && token->opr.c == ',') 2236 ? fetch_number (regexp, token, syntax) : -2)); 2237 } 2238 if (BE (start == -2 || end == -2, 0)) 2239 { 2240 /* Invalid sequence. */ 2241 if (token->type == OP_CLOSE_DUP_NUM) 2242 goto parse_dup_op_invalid_interval; 2243 else 2244 goto parse_dup_op_ebrace; 2245 } 2246 if (BE (start == 0 && end == 0, 0)) 2247 { 2248 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */ 2249 *token = fetch_token (regexp, syntax); 2250 free_bin_tree (dup_elem); 2251 return NULL; 2252 } 2253 2254 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2255 elem = tree; 2256 for (i = 0; i < start; ++i) 2257 if (i != 0) 2258 { 2259 work_tree = duplicate_tree (elem, dfa); 2260 tree = create_tree (tree, work_tree, CONCAT, 0); 2261 if (BE (work_tree == NULL || tree == NULL, 0)) 2262 goto parse_dup_op_espace; 2263 } 2264 2265 if (end == -1) 2266 { 2267 /* We treat "<re>{0,}" as "<re>*". */ 2268 dup_token.type = OP_DUP_ASTERISK; 2269 if (start > 0) 2270 { 2271 elem = duplicate_tree (elem, dfa); 2272 new_idx = re_dfa_add_node (dfa, dup_token, 0); 2273 work_tree = create_tree (elem, NULL, 0, new_idx); 2274 tree = create_tree (tree, work_tree, CONCAT, 0); 2275 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL 2276 || tree == NULL, 0)) 2277 goto parse_dup_op_espace; 2278 } 2279 else 2280 { 2281 new_idx = re_dfa_add_node (dfa, dup_token, 0); 2282 tree = create_tree (elem, NULL, 0, new_idx); 2283 if (BE (new_idx == -1 || tree == NULL, 0)) 2284 goto parse_dup_op_espace; 2285 } 2286 } 2287 else if (end - start > 0) 2288 { 2289 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */ 2290 dup_token.type = OP_DUP_QUESTION; 2291 if (start > 0) 2292 { 2293 elem = duplicate_tree (elem, dfa); 2294 new_idx = re_dfa_add_node (dfa, dup_token, 0); 2295 elem = create_tree (elem, NULL, 0, new_idx); 2296 tree = create_tree (tree, elem, CONCAT, 0); 2297 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0)) 2298 goto parse_dup_op_espace; 2299 } 2300 else 2301 { 2302 new_idx = re_dfa_add_node (dfa, dup_token, 0); 2303 tree = elem = create_tree (elem, NULL, 0, new_idx); 2304 if (BE (new_idx == -1 || tree == NULL, 0)) 2305 goto parse_dup_op_espace; 2306 } 2307 for (i = 1; i < end - start; ++i) 2308 { 2309 work_tree = duplicate_tree (elem, dfa); 2310 tree = create_tree (tree, work_tree, CONCAT, 0); 2311 if (BE (work_tree == NULL || tree == NULL, 0)) 2312 { 2313 *err = REG_ESPACE; 2314 return NULL; 2315 } 2316 } 2317 } 2318 } 2319 else 2320 { 2321 new_idx = re_dfa_add_node (dfa, *token, 0); 2322 tree = create_tree (tree, NULL, 0, new_idx); 2323 if (BE (new_idx == -1 || tree == NULL, 0)) 2324 { 2325 *err = REG_ESPACE; 2326 return NULL; 2327 } 2328 } 2329 *token = fetch_token (regexp, syntax); 2330 return tree; 2331 2332 parse_dup_op_espace: 2333 free_bin_tree (tree); 2334 *err = REG_ESPACE; 2335 return NULL; 2336 2337 parse_dup_op_ebrace: 2338 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2339 { 2340 *err = REG_EBRACE; 2341 return NULL; 2342 } 2343 goto parse_dup_op_rollback; 2344 parse_dup_op_invalid_interval: 2345 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2346 { 2347 *err = REG_BADBR; 2348 return NULL; 2349 } 2350 parse_dup_op_rollback: 2351 re_string_set_index (regexp, start_idx); 2352 *token = start_token; 2353 token->type = CHARACTER; 2354 return dup_elem; 2355} 2356 2357/* Size of the names for collating symbol/equivalence_class/character_class. 2358 I'm not sure, but maybe enough. */ 2359#define BRACKET_NAME_BUF_SIZE 32 2360 2361#ifndef _LIBC 2362 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2363 Build the range expression which starts from START_ELEM, and ends 2364 at END_ELEM. The result are written to MBCSET and SBCSET. 2365 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2366 mbcset->range_ends, is a pointer argument sinse we may 2367 update it. */ 2368 2369static reg_errcode_t 2370# ifdef RE_ENABLE_I18N 2371build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2372 re_charset_t *mbcset; 2373 int *range_alloc; 2374# else /* not RE_ENABLE_I18N */ 2375build_range_exp (sbcset, start_elem, end_elem) 2376# endif /* not RE_ENABLE_I18N */ 2377 re_bitset_ptr_t sbcset; 2378 bracket_elem_t *start_elem, *end_elem; 2379{ 2380 unsigned int start_ch, end_ch; 2381 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2382 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2383 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2384 0)) 2385 return REG_ERANGE; 2386 2387 /* We can handle no multi character collating elements without libc 2388 support. */ 2389 if (BE ((start_elem->type == COLL_SYM 2390 && strlen ((char *) start_elem->opr.name) > 1) 2391 || (end_elem->type == COLL_SYM 2392 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2393 return REG_ECOLLATE; 2394 2395# ifdef RE_ENABLE_I18N 2396 { 2397 wchar_t wc, start_wc, end_wc; 2398 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2399 2400 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2401 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2402 : 0)); 2403 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2404 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2405 : 0)); 2406 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2407 ? __btowc (start_ch) : start_elem->opr.wch); 2408 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2409 ? __btowc (end_ch) : end_elem->opr.wch); 2410 cmp_buf[0] = start_wc; 2411 cmp_buf[4] = end_wc; 2412 if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2413 return REG_ERANGE; 2414 2415 /* Check the space of the arrays. */ 2416 if (*range_alloc == mbcset->nranges) 2417 { 2418 /* There are not enough space, need realloc. */ 2419 wchar_t *new_array_start, *new_array_end; 2420 int new_nranges; 2421 2422 /* +1 in case of mbcset->nranges is 0. */ 2423 new_nranges = 2 * mbcset->nranges + 1; 2424 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2425 are NULL if *range_alloc == 0. */ 2426 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2427 new_nranges); 2428 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2429 new_nranges); 2430 2431 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2432 return REG_ESPACE; 2433 2434 mbcset->range_starts = new_array_start; 2435 mbcset->range_ends = new_array_end; 2436 *range_alloc = new_nranges; 2437 } 2438 2439 mbcset->range_starts[mbcset->nranges] = start_wc; 2440 mbcset->range_ends[mbcset->nranges++] = end_wc; 2441 2442 /* Build the table for single byte characters. */ 2443 for (wc = 0; wc <= SBC_MAX; ++wc) 2444 { 2445 cmp_buf[2] = wc; 2446 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2447 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2448 bitset_set (sbcset, wc); 2449 } 2450 } 2451# else /* not RE_ENABLE_I18N */ 2452 { 2453 unsigned int ch; 2454 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2455 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2456 : 0)); 2457 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2458 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2459 : 0)); 2460 if (start_ch > end_ch) 2461 return REG_ERANGE; 2462 /* Build the table for single byte characters. */ 2463 for (ch = 0; ch <= SBC_MAX; ++ch) 2464 if (start_ch <= ch && ch <= end_ch) 2465 bitset_set (sbcset, ch); 2466 } 2467# endif /* not RE_ENABLE_I18N */ 2468 return REG_NOERROR; 2469} 2470#endif /* not _LIBC */ 2471 2472#ifndef _LIBC 2473/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2474 Build the collating element which is represented by NAME. 2475 The result are written to MBCSET and SBCSET. 2476 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2477 pointer argument since we may update it. */ 2478 2479static reg_errcode_t 2480# ifdef RE_ENABLE_I18N 2481build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2482 re_charset_t *mbcset; 2483 int *coll_sym_alloc; 2484# else /* not RE_ENABLE_I18N */ 2485build_collating_symbol (sbcset, name) 2486# endif /* not RE_ENABLE_I18N */ 2487 re_bitset_ptr_t sbcset; 2488 const unsigned char *name; 2489{ 2490 size_t name_len = strlen ((const char *) name); 2491 if (BE (name_len != 1, 0)) 2492 return REG_ECOLLATE; 2493 else 2494 { 2495 bitset_set (sbcset, name[0]); 2496 return REG_NOERROR; 2497 } 2498} 2499#endif /* not _LIBC */ 2500 2501/* This function parse bracket expression like "[abc]", "[a-c]", 2502 "[[.a-a.]]" etc. */ 2503 2504static bin_tree_t * 2505parse_bracket_exp (regexp, dfa, token, syntax, err) 2506 re_string_t *regexp; 2507 re_dfa_t *dfa; 2508 re_token_t *token; 2509 reg_syntax_t syntax; 2510 reg_errcode_t *err; 2511{ 2512#ifdef _LIBC 2513 const unsigned char *collseqmb; 2514 const char *collseqwc; 2515 uint32_t nrules; 2516 int32_t table_size; 2517 const int32_t *symb_table; 2518 const unsigned char *extra; 2519 2520 /* Local function for parse_bracket_exp used in _LIBC environement. 2521 Seek the collating symbol entry correspondings to NAME. 2522 Return the index of the symbol in the SYMB_TABLE. */ 2523 2524 static inline int32_t 2525 seek_collating_symbol_entry (name, name_len) 2526 const unsigned char *name; 2527 size_t name_len; 2528 { 2529 int32_t hash = elem_hash ((const char *) name, name_len); 2530 int32_t elem = hash % table_size; 2531 int32_t second = hash % (table_size - 2); 2532 while (symb_table[2 * elem] != 0) 2533 { 2534 /* First compare the hashing value. */ 2535 if (symb_table[2 * elem] == hash 2536 /* Compare the length of the name. */ 2537 && name_len == extra[symb_table[2 * elem + 1]] 2538 /* Compare the name. */ 2539 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2540 name_len) == 0) 2541 { 2542 /* Yep, this is the entry. */ 2543 break; 2544 } 2545 2546 /* Next entry. */ 2547 elem += second; 2548 } 2549 return elem; 2550 } 2551 2552 /* Local function for parse_bracket_exp used in _LIBC environement. 2553 Look up the collation sequence value of BR_ELEM. 2554 Return the value if succeeded, UINT_MAX otherwise. */ 2555 2556 static inline unsigned int 2557 lookup_collation_sequence_value (br_elem) 2558 bracket_elem_t *br_elem; 2559 { 2560 if (br_elem->type == SB_CHAR) 2561 { 2562 /* 2563 if (re_mb_cur_max == 1) 2564 */ 2565 if (nrules == 0) 2566 return collseqmb[br_elem->opr.ch]; 2567 else 2568 { 2569 wint_t wc = __btowc (br_elem->opr.ch); 2570 return collseq_table_lookup (collseqwc, wc); 2571 } 2572 } 2573 else if (br_elem->type == MB_CHAR) 2574 { 2575 return collseq_table_lookup (collseqwc, br_elem->opr.wch); 2576 } 2577 else if (br_elem->type == COLL_SYM) 2578 { 2579 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2580 if (nrules != 0) 2581 { 2582 int32_t elem, idx; 2583 elem = seek_collating_symbol_entry (br_elem->opr.name, 2584 sym_name_len); 2585 if (symb_table[2 * elem] != 0) 2586 { 2587 /* We found the entry. */ 2588 idx = symb_table[2 * elem + 1]; 2589 /* Skip the name of collating element name. */ 2590 idx += 1 + extra[idx]; 2591 /* Skip the byte sequence of the collating element. */ 2592 idx += 1 + extra[idx]; 2593 /* Adjust for the alignment. */ 2594 idx = (idx + 3) & ~3; 2595 /* Skip the multibyte collation sequence value. */ 2596 idx += sizeof (unsigned int); 2597 /* Skip the wide char sequence of the collating element. */ 2598 idx += sizeof (unsigned int) * 2599 (1 + *(unsigned int *) (extra + idx)); 2600 /* Return the collation sequence value. */ 2601 return *(unsigned int *) (extra + idx); 2602 } 2603 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2604 { 2605 /* No valid character. Match it as a single byte 2606 character. */ 2607 return collseqmb[br_elem->opr.name[0]]; 2608 } 2609 } 2610 else if (sym_name_len == 1) 2611 return collseqmb[br_elem->opr.name[0]]; 2612 } 2613 return UINT_MAX; 2614 } 2615 2616 /* Local function for parse_bracket_exp used in _LIBC environement. 2617 Build the range expression which starts from START_ELEM, and ends 2618 at END_ELEM. The result are written to MBCSET and SBCSET. 2619 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2620 mbcset->range_ends, is a pointer argument sinse we may 2621 update it. */ 2622 2623 static inline reg_errcode_t 2624# ifdef RE_ENABLE_I18N 2625 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2626 re_charset_t *mbcset; 2627 int *range_alloc; 2628# else /* not RE_ENABLE_I18N */ 2629 build_range_exp (sbcset, start_elem, end_elem) 2630# endif /* not RE_ENABLE_I18N */ 2631 re_bitset_ptr_t sbcset; 2632 bracket_elem_t *start_elem, *end_elem; 2633 { 2634 unsigned int ch; 2635 uint32_t start_collseq; 2636 uint32_t end_collseq; 2637 2638# ifdef RE_ENABLE_I18N 2639 /* Check the space of the arrays. */ 2640 if (*range_alloc == mbcset->nranges) 2641 { 2642 /* There are not enough space, need realloc. */ 2643 uint32_t *new_array_start; 2644 uint32_t *new_array_end; 2645 int new_nranges; 2646 2647 /* +1 in case of mbcset->nranges is 0. */ 2648 new_nranges = 2 * mbcset->nranges + 1; 2649 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2650 are NULL if *range_alloc == 0. */ 2651 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2652 new_nranges); 2653 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2654 new_nranges); 2655 2656 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2657 return REG_ESPACE; 2658 2659 mbcset->range_starts = new_array_start; 2660 mbcset->range_ends = new_array_end; 2661 *range_alloc = new_nranges; 2662 } 2663# endif /* RE_ENABLE_I18N */ 2664 2665 /* Equivalence Classes and Character Classes can't be a range 2666 start/end. */ 2667 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2668 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2669 0)) 2670 return REG_ERANGE; 2671 2672 start_collseq = lookup_collation_sequence_value (start_elem); 2673 end_collseq = lookup_collation_sequence_value (end_elem); 2674 /* Check start/end collation sequence values. */ 2675 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2676 return REG_ECOLLATE; 2677 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2678 return REG_ERANGE; 2679 2680# ifdef RE_ENABLE_I18N 2681 /* Got valid collation sequence values, add them as a new entry. */ 2682 mbcset->range_starts[mbcset->nranges] = start_collseq; 2683 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2684# endif /* RE_ENABLE_I18N */ 2685 2686 /* Build the table for single byte characters. */ 2687 for (ch = 0; ch <= SBC_MAX; ch++) 2688 { 2689 uint32_t ch_collseq; 2690 /* 2691 if (re_mb_cur_max == 1) 2692 */ 2693 if (nrules == 0) 2694 ch_collseq = collseqmb[ch]; 2695 else 2696 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch)); 2697 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2698 bitset_set (sbcset, ch); 2699 } 2700 return REG_NOERROR; 2701 } 2702 2703 /* Local function for parse_bracket_exp used in _LIBC environement. 2704 Build the collating element which is represented by NAME. 2705 The result are written to MBCSET and SBCSET. 2706 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2707 pointer argument sinse we may update it. */ 2708 2709 static inline reg_errcode_t 2710# ifdef RE_ENABLE_I18N 2711 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2712 re_charset_t *mbcset; 2713 int *coll_sym_alloc; 2714# else /* not RE_ENABLE_I18N */ 2715 build_collating_symbol (sbcset, name) 2716# endif /* not RE_ENABLE_I18N */ 2717 re_bitset_ptr_t sbcset; 2718 const unsigned char *name; 2719 { 2720 int32_t elem, idx; 2721 size_t name_len = strlen ((const char *) name); 2722 if (nrules != 0) 2723 { 2724 elem = seek_collating_symbol_entry (name, name_len); 2725 if (symb_table[2 * elem] != 0) 2726 { 2727 /* We found the entry. */ 2728 idx = symb_table[2 * elem + 1]; 2729 /* Skip the name of collating element name. */ 2730 idx += 1 + extra[idx]; 2731 } 2732 else if (symb_table[2 * elem] == 0 && name_len == 1) 2733 { 2734 /* No valid character, treat it as a normal 2735 character. */ 2736 bitset_set (sbcset, name[0]); 2737 return REG_NOERROR; 2738 } 2739 else 2740 return REG_ECOLLATE; 2741 2742# ifdef RE_ENABLE_I18N 2743 /* Got valid collation sequence, add it as a new entry. */ 2744 /* Check the space of the arrays. */ 2745 if (*coll_sym_alloc == mbcset->ncoll_syms) 2746 { 2747 /* Not enough, realloc it. */ 2748 /* +1 in case of mbcset->ncoll_syms is 0. */ 2749 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 2750 /* Use realloc since mbcset->coll_syms is NULL 2751 if *alloc == 0. */ 2752 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t, 2753 *coll_sym_alloc); 2754 if (BE (mbcset->coll_syms == NULL, 0)) 2755 return REG_ESPACE; 2756 } 2757 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 2758# endif /* RE_ENABLE_I18N */ 2759 return REG_NOERROR; 2760 } 2761 else 2762 { 2763 if (BE (name_len != 1, 0)) 2764 return REG_ECOLLATE; 2765 else 2766 { 2767 bitset_set (sbcset, name[0]); 2768 return REG_NOERROR; 2769 } 2770 } 2771 } 2772#endif 2773 2774 re_token_t br_token; 2775 re_bitset_ptr_t sbcset; 2776#ifdef RE_ENABLE_I18N 2777 re_charset_t *mbcset; 2778 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 2779 int equiv_class_alloc = 0, char_class_alloc = 0; 2780#else /* not RE_ENABLE_I18N */ 2781 int non_match = 0; 2782#endif /* not RE_ENABLE_I18N */ 2783 bin_tree_t *work_tree; 2784 int token_len, new_idx; 2785#ifdef _LIBC 2786 collseqmb = (const unsigned char *) 2787 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 2788 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 2789 if (nrules) 2790 { 2791 /* 2792 if (re_mb_cur_max > 1) 2793 */ 2794 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 2795 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 2796 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 2797 _NL_COLLATE_SYMB_TABLEMB); 2798 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 2799 _NL_COLLATE_SYMB_EXTRAMB); 2800 } 2801#endif 2802 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); 2803#ifdef RE_ENABLE_I18N 2804 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 2805#endif /* RE_ENABLE_I18N */ 2806#ifdef RE_ENABLE_I18N 2807 if (BE (sbcset == NULL || mbcset == NULL, 0)) 2808#else 2809 if (BE (sbcset == NULL, 0)) 2810#endif /* RE_ENABLE_I18N */ 2811 { 2812 *err = REG_ESPACE; 2813 return NULL; 2814 } 2815 2816 token_len = peek_token_bracket (token, regexp, syntax); 2817 if (BE (token->type == END_OF_RE, 0)) 2818 { 2819 *err = REG_BADPAT; 2820 goto parse_bracket_exp_free_return; 2821 } 2822 if (token->type == OP_NON_MATCH_LIST) 2823 { 2824#ifdef RE_ENABLE_I18N 2825 int i; 2826 mbcset->non_match = 1; 2827#else /* not RE_ENABLE_I18N */ 2828 non_match = 1; 2829#endif /* not RE_ENABLE_I18N */ 2830 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 2831 bitset_set (sbcset, '\0'); 2832 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 2833 token_len = peek_token_bracket (token, regexp, syntax); 2834 if (BE (token->type == END_OF_RE, 0)) 2835 { 2836 *err = REG_BADPAT; 2837 goto parse_bracket_exp_free_return; 2838 } 2839#ifdef RE_ENABLE_I18N 2840 if (re_mb_cur_max > 1) 2841 for (i = 0; i < SBC_MAX; ++i) 2842 if (__btowc (i) == WEOF) 2843 bitset_set (sbcset, i); 2844#endif /* RE_ENABLE_I18N */ 2845 } 2846 2847 /* We treat the first ']' as a normal character. */ 2848 if (token->type == OP_CLOSE_BRACKET) 2849 token->type = CHARACTER; 2850 2851 while (1) 2852 { 2853 bracket_elem_t start_elem, end_elem; 2854 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 2855 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 2856 reg_errcode_t ret; 2857 int token_len2 = 0, is_range_exp = 0; 2858 re_token_t token2; 2859 2860 start_elem.opr.name = start_name_buf; 2861 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 2862 syntax); 2863 if (BE (ret != REG_NOERROR, 0)) 2864 { 2865 *err = ret; 2866 goto parse_bracket_exp_free_return; 2867 } 2868 2869 token_len = peek_token_bracket (token, regexp, syntax); 2870 if (BE (token->type == END_OF_RE, 0)) 2871 { 2872 *err = REG_BADPAT; 2873 goto parse_bracket_exp_free_return; 2874 } 2875 if (token->type == OP_CHARSET_RANGE) 2876 { 2877 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 2878 token_len2 = peek_token_bracket (&token2, regexp, syntax); 2879 if (BE (token->type == END_OF_RE, 0)) 2880 { 2881 *err = REG_BADPAT; 2882 goto parse_bracket_exp_free_return; 2883 } 2884 if (token2.type == OP_CLOSE_BRACKET) 2885 { 2886 /* We treat the last '-' as a normal character. */ 2887 re_string_skip_bytes (regexp, -token_len); 2888 token->type = CHARACTER; 2889 } 2890 else 2891 is_range_exp = 1; 2892 } 2893 2894 if (is_range_exp == 1) 2895 { 2896 end_elem.opr.name = end_name_buf; 2897 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 2898 dfa, syntax); 2899 if (BE (ret != REG_NOERROR, 0)) 2900 { 2901 *err = ret; 2902 goto parse_bracket_exp_free_return; 2903 } 2904 2905 token_len = peek_token_bracket (token, regexp, syntax); 2906 if (BE (token->type == END_OF_RE, 0)) 2907 { 2908 *err = REG_BADPAT; 2909 goto parse_bracket_exp_free_return; 2910 } 2911 *err = build_range_exp (sbcset, 2912#ifdef RE_ENABLE_I18N 2913 mbcset, &range_alloc, 2914#endif /* RE_ENABLE_I18N */ 2915 &start_elem, &end_elem); 2916 if (BE (*err != REG_NOERROR, 0)) 2917 goto parse_bracket_exp_free_return; 2918 } 2919 else 2920 { 2921 switch (start_elem.type) 2922 { 2923 case SB_CHAR: 2924 bitset_set (sbcset, start_elem.opr.ch); 2925 break; 2926#ifdef RE_ENABLE_I18N 2927 case MB_CHAR: 2928 /* Check whether the array has enough space. */ 2929 if (mbchar_alloc == mbcset->nmbchars) 2930 { 2931 /* Not enough, realloc it. */ 2932 /* +1 in case of mbcset->nmbchars is 0. */ 2933 mbchar_alloc = 2 * mbcset->nmbchars + 1; 2934 /* Use realloc since array is NULL if *alloc == 0. */ 2935 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t, 2936 mbchar_alloc); 2937 if (BE (mbcset->mbchars == NULL, 0)) 2938 goto parse_bracket_exp_espace; 2939 } 2940 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 2941 break; 2942#endif /* RE_ENABLE_I18N */ 2943 case EQUIV_CLASS: 2944 *err = build_equiv_class (sbcset, 2945#ifdef RE_ENABLE_I18N 2946 mbcset, &equiv_class_alloc, 2947#endif /* RE_ENABLE_I18N */ 2948 start_elem.opr.name); 2949 if (BE (*err != REG_NOERROR, 0)) 2950 goto parse_bracket_exp_free_return; 2951 break; 2952 case COLL_SYM: 2953 *err = build_collating_symbol (sbcset, 2954#ifdef RE_ENABLE_I18N 2955 mbcset, &coll_sym_alloc, 2956#endif /* RE_ENABLE_I18N */ 2957 start_elem.opr.name); 2958 if (BE (*err != REG_NOERROR, 0)) 2959 goto parse_bracket_exp_free_return; 2960 break; 2961 case CHAR_CLASS: 2962 ret = build_charclass (regexp->trans, sbcset, 2963#ifdef RE_ENABLE_I18N 2964 mbcset, &char_class_alloc, 2965#endif /* RE_ENABLE_I18N */ 2966 start_elem.opr.name, syntax); 2967 if (BE (ret != REG_NOERROR, 0)) 2968 goto parse_bracket_exp_espace; 2969 break; 2970 default: 2971 assert (0); 2972 break; 2973 } 2974 } 2975 if (token->type == OP_CLOSE_BRACKET) 2976 break; 2977 } 2978 2979 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 2980 2981 /* If it is non-matching list. */ 2982#ifdef RE_ENABLE_I18N 2983 if (mbcset->non_match) 2984#else /* not RE_ENABLE_I18N */ 2985 if (non_match) 2986#endif /* not RE_ENABLE_I18N */ 2987 bitset_not (sbcset); 2988 2989 /* Build a tree for simple bracket. */ 2990 br_token.type = SIMPLE_BRACKET; 2991 br_token.opr.sbcset = sbcset; 2992 new_idx = re_dfa_add_node (dfa, br_token, 0); 2993 work_tree = create_tree (NULL, NULL, 0, new_idx); 2994 if (BE (new_idx == -1 || work_tree == NULL, 0)) 2995 goto parse_bracket_exp_espace; 2996 2997#ifdef RE_ENABLE_I18N 2998 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 2999 || mbcset->nranges || (re_mb_cur_max > 1 && (mbcset->nchar_classes 3000 || mbcset->non_match))) 3001 { 3002 re_token_t alt_token; 3003 bin_tree_t *mbc_tree; 3004 /* Build a tree for complex bracket. */ 3005 br_token.type = COMPLEX_BRACKET; 3006 br_token.opr.mbcset = mbcset; 3007 dfa->has_mb_node = 1; 3008 new_idx = re_dfa_add_node (dfa, br_token, 0); 3009 mbc_tree = create_tree (NULL, NULL, 0, new_idx); 3010 if (BE (new_idx == -1 || mbc_tree == NULL, 0)) 3011 goto parse_bracket_exp_espace; 3012 /* Then join them by ALT node. */ 3013 dfa->has_plural_match = 1; 3014 alt_token.type = OP_ALT; 3015 new_idx = re_dfa_add_node (dfa, alt_token, 0); 3016 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx); 3017 if (BE (new_idx != -1 && mbc_tree != NULL, 1)) 3018 return work_tree; 3019 } 3020 else 3021 { 3022 free_charset (mbcset); 3023 return work_tree; 3024 } 3025#else /* not RE_ENABLE_I18N */ 3026 return work_tree; 3027#endif /* not RE_ENABLE_I18N */ 3028 3029 parse_bracket_exp_espace: 3030 *err = REG_ESPACE; 3031 parse_bracket_exp_free_return: 3032 re_free (sbcset); 3033#ifdef RE_ENABLE_I18N 3034 free_charset (mbcset); 3035#endif /* RE_ENABLE_I18N */ 3036 return NULL; 3037} 3038 3039/* Parse an element in the bracket expression. */ 3040 3041static reg_errcode_t 3042parse_bracket_element (elem, regexp, token, token_len, dfa, syntax) 3043 bracket_elem_t *elem; 3044 re_string_t *regexp; 3045 re_token_t *token; 3046 int token_len; 3047 re_dfa_t *dfa; 3048 reg_syntax_t syntax; 3049{ 3050#ifdef RE_ENABLE_I18N 3051 int cur_char_size; 3052 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3053 if (cur_char_size > 1) 3054 { 3055 elem->type = MB_CHAR; 3056 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3057 re_string_skip_bytes (regexp, cur_char_size); 3058 return REG_NOERROR; 3059 } 3060#endif /* RE_ENABLE_I18N */ 3061 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3062 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3063 || token->type == OP_OPEN_EQUIV_CLASS) 3064 return parse_bracket_symbol (elem, regexp, token); 3065 elem->type = SB_CHAR; 3066 elem->opr.ch = token->opr.c; 3067 return REG_NOERROR; 3068} 3069 3070/* Parse a bracket symbol in the bracket expression. Bracket symbols are 3071 such as [:<character_class>:], [.<collating_element>.], and 3072 [=<equivalent_class>=]. */ 3073 3074static reg_errcode_t 3075parse_bracket_symbol (elem, regexp, token) 3076 bracket_elem_t *elem; 3077 re_string_t *regexp; 3078 re_token_t *token; 3079{ 3080 unsigned char ch, delim = token->opr.c; 3081 int i = 0; 3082 for (;; ++i) 3083 { 3084 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE) 3085 return REG_EBRACK; 3086 if (token->type == OP_OPEN_CHAR_CLASS) 3087 ch = re_string_fetch_byte_case (regexp); 3088 else 3089 ch = re_string_fetch_byte (regexp); 3090 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3091 break; 3092 elem->opr.name[i] = ch; 3093 } 3094 re_string_skip_bytes (regexp, 1); 3095 elem->opr.name[i] = '\0'; 3096 switch (token->type) 3097 { 3098 case OP_OPEN_COLL_ELEM: 3099 elem->type = COLL_SYM; 3100 break; 3101 case OP_OPEN_EQUIV_CLASS: 3102 elem->type = EQUIV_CLASS; 3103 break; 3104 case OP_OPEN_CHAR_CLASS: 3105 elem->type = CHAR_CLASS; 3106 break; 3107 default: 3108 break; 3109 } 3110 return REG_NOERROR; 3111} 3112 3113 /* Helper function for parse_bracket_exp. 3114 Build the equivalence class which is represented by NAME. 3115 The result are written to MBCSET and SBCSET. 3116 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3117 is a pointer argument sinse we may update it. */ 3118 3119static reg_errcode_t 3120#ifdef RE_ENABLE_I18N 3121build_equiv_class (sbcset, mbcset, equiv_class_alloc, name) 3122 re_charset_t *mbcset; 3123 int *equiv_class_alloc; 3124#else /* not RE_ENABLE_I18N */ 3125build_equiv_class (sbcset, name) 3126#endif /* not RE_ENABLE_I18N */ 3127 re_bitset_ptr_t sbcset; 3128 const unsigned char *name; 3129{ 3130#if defined _LIBC && defined RE_ENABLE_I18N 3131 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3132 if (nrules != 0) 3133 { 3134 const int32_t *table, *indirect; 3135 const unsigned char *weights, *extra, *cp; 3136 unsigned char char_buf[2]; 3137 int32_t idx1, idx2; 3138 unsigned int ch; 3139 size_t len; 3140 /* This #include defines a local function! */ 3141# include <locale/weight.h> 3142 /* Calculate the index for equivalence class. */ 3143 cp = name; 3144 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3145 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3146 _NL_COLLATE_WEIGHTMB); 3147 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3148 _NL_COLLATE_EXTRAMB); 3149 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3150 _NL_COLLATE_INDIRECTMB); 3151 idx1 = findidx (&cp); 3152 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3153 /* This isn't a valid character. */ 3154 return REG_ECOLLATE; 3155 3156 /* Build single byte matcing table for this equivalence class. */ 3157 char_buf[1] = (unsigned char) '\0'; 3158 len = weights[idx1]; 3159 for (ch = 0; ch < SBC_MAX; ++ch) 3160 { 3161 char_buf[0] = ch; 3162 cp = char_buf; 3163 idx2 = findidx (&cp); 3164/* 3165 idx2 = table[ch]; 3166*/ 3167 if (idx2 == 0) 3168 /* This isn't a valid character. */ 3169 continue; 3170 if (len == weights[idx2]) 3171 { 3172 int cnt = 0; 3173 while (cnt <= len && 3174 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) 3175 ++cnt; 3176 3177 if (cnt > len) 3178 bitset_set (sbcset, ch); 3179 } 3180 } 3181 /* Check whether the array has enough space. */ 3182 if (*equiv_class_alloc == mbcset->nequiv_classes) 3183 { 3184 /* Not enough, realloc it. */ 3185 /* +1 in case of mbcset->nequiv_classes is 0. */ 3186 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3187 /* Use realloc since the array is NULL if *alloc == 0. */ 3188 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t, 3189 *equiv_class_alloc); 3190 if (BE (mbcset->equiv_classes == NULL, 0)) 3191 return REG_ESPACE; 3192 } 3193 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3194 } 3195 else 3196#endif /* _LIBC && RE_ENABLE_I18N */ 3197 { 3198 if (BE (strlen ((const char *) name) != 1, 0)) 3199 return REG_ECOLLATE; 3200 bitset_set (sbcset, *name); 3201 } 3202 return REG_NOERROR; 3203} 3204 3205 /* Helper function for parse_bracket_exp. 3206 Build the character class which is represented by NAME. 3207 The result are written to MBCSET and SBCSET. 3208 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3209 is a pointer argument sinse we may update it. */ 3210 3211static reg_errcode_t 3212#ifdef RE_ENABLE_I18N 3213build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax) 3214 re_charset_t *mbcset; 3215 int *char_class_alloc; 3216#else /* not RE_ENABLE_I18N */ 3217build_charclass (trans, sbcset, class_name, syntax) 3218#endif /* not RE_ENABLE_I18N */ 3219 RE_TRANSLATE_TYPE trans; 3220 re_bitset_ptr_t sbcset; 3221 const unsigned char *class_name; 3222 reg_syntax_t syntax; 3223{ 3224 int i; 3225 const char *name = (const char *) class_name; 3226 3227 /* In case of REG_ICASE "upper" and "lower" match the both of 3228 upper and lower cases. */ 3229 if ((syntax & RE_ICASE) 3230 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3231 name = "alpha"; 3232 3233#ifdef RE_ENABLE_I18N 3234 /* Check the space of the arrays. */ 3235 if (*char_class_alloc == mbcset->nchar_classes) 3236 { 3237 /* Not enough, realloc it. */ 3238 /* +1 in case of mbcset->nchar_classes is 0. */ 3239 *char_class_alloc = 2 * mbcset->nchar_classes + 1; 3240 /* Use realloc since array is NULL if *alloc == 0. */ 3241 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t, 3242 *char_class_alloc); 3243 if (BE (mbcset->char_classes == NULL, 0)) 3244 return REG_ESPACE; 3245 } 3246 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3247#endif /* RE_ENABLE_I18N */ 3248 3249#define BUILD_CHARCLASS_LOOP(ctype_func)\ 3250 for (i = 0; i < SBC_MAX; ++i) \ 3251 { \ 3252 if (ctype_func (i)) \ 3253 { \ 3254 int ch = trans ? trans[i] : i; \ 3255 bitset_set (sbcset, ch); \ 3256 } \ 3257 } 3258 3259 if (strcmp (name, "alnum") == 0) 3260 BUILD_CHARCLASS_LOOP (isalnum) 3261 else if (strcmp (name, "cntrl") == 0) 3262 BUILD_CHARCLASS_LOOP (iscntrl) 3263 else if (strcmp (name, "lower") == 0) 3264 BUILD_CHARCLASS_LOOP (islower) 3265 else if (strcmp (name, "space") == 0) 3266 BUILD_CHARCLASS_LOOP (isspace) 3267 else if (strcmp (name, "alpha") == 0) 3268 BUILD_CHARCLASS_LOOP (isalpha) 3269 else if (strcmp (name, "digit") == 0) 3270 BUILD_CHARCLASS_LOOP (isdigit) 3271 else if (strcmp (name, "print") == 0) 3272 BUILD_CHARCLASS_LOOP (isprint) 3273 else if (strcmp (name, "upper") == 0) 3274 BUILD_CHARCLASS_LOOP (isupper) 3275 else if (strcmp (name, "blank") == 0) 3276 BUILD_CHARCLASS_LOOP (isblank) 3277 else if (strcmp (name, "graph") == 0) 3278 BUILD_CHARCLASS_LOOP (isgraph) 3279 else if (strcmp (name, "punct") == 0) 3280 BUILD_CHARCLASS_LOOP (ispunct) 3281 else if (strcmp (name, "xdigit") == 0) 3282 BUILD_CHARCLASS_LOOP (isxdigit) 3283 else 3284 return REG_ECTYPE; 3285 3286 return REG_NOERROR; 3287} 3288 3289static bin_tree_t * 3290build_word_op (dfa, trans, not, err) 3291 re_dfa_t *dfa; 3292 RE_TRANSLATE_TYPE trans; 3293 int not; 3294 reg_errcode_t *err; 3295{ 3296 re_bitset_ptr_t sbcset; 3297#ifdef RE_ENABLE_I18N 3298 re_charset_t *mbcset; 3299 int alloc = 0; 3300#else /* not RE_ENABLE_I18N */ 3301 int non_match = 0; 3302#endif /* not RE_ENABLE_I18N */ 3303 reg_errcode_t ret; 3304 re_token_t br_token; 3305 bin_tree_t *tree; 3306 int new_idx; 3307 3308 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); 3309#ifdef RE_ENABLE_I18N 3310 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3311#endif /* RE_ENABLE_I18N */ 3312 3313#ifdef RE_ENABLE_I18N 3314 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3315#else /* not RE_ENABLE_I18N */ 3316 if (BE (sbcset == NULL, 0)) 3317#endif /* not RE_ENABLE_I18N */ 3318 { 3319 *err = REG_ESPACE; 3320 return NULL; 3321 } 3322 3323 if (not) 3324 { 3325#ifdef RE_ENABLE_I18N 3326 int i; 3327 /* 3328 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3329 bitset_set(cset->sbcset, '\0'); 3330 */ 3331 mbcset->non_match = 1; 3332 if (re_mb_cur_max > 1) 3333 for (i = 0; i < SBC_MAX; ++i) 3334 if (__btowc (i) == WEOF) 3335 bitset_set (sbcset, i); 3336#else /* not RE_ENABLE_I18N */ 3337 non_match = 1; 3338#endif /* not RE_ENABLE_I18N */ 3339 } 3340 3341 /* We don't care the syntax in this case. */ 3342 ret = build_charclass (trans, sbcset, 3343#ifdef RE_ENABLE_I18N 3344 mbcset, &alloc, 3345#endif /* RE_ENABLE_I18N */ 3346 (const unsigned char *) "alpha", 0); 3347 3348 if (BE (ret != REG_NOERROR, 0)) 3349 { 3350 re_free (sbcset); 3351#ifdef RE_ENABLE_I18N 3352 free_charset (mbcset); 3353#endif /* RE_ENABLE_I18N */ 3354 *err = ret; 3355 return NULL; 3356 } 3357 /* \w match '_' also. */ 3358 bitset_set (sbcset, '_'); 3359 3360 /* If it is non-matching list. */ 3361#ifdef RE_ENABLE_I18N 3362 if (mbcset->non_match) 3363#else /* not RE_ENABLE_I18N */ 3364 if (non_match) 3365#endif /* not RE_ENABLE_I18N */ 3366 bitset_not (sbcset); 3367 3368 /* Build a tree for simple bracket. */ 3369 br_token.type = SIMPLE_BRACKET; 3370 br_token.opr.sbcset = sbcset; 3371 new_idx = re_dfa_add_node (dfa, br_token, 0); 3372 tree = create_tree (NULL, NULL, 0, new_idx); 3373 if (BE (new_idx == -1 || tree == NULL, 0)) 3374 goto build_word_op_espace; 3375 3376#ifdef RE_ENABLE_I18N 3377 if (re_mb_cur_max > 1) 3378 { 3379 re_token_t alt_token; 3380 bin_tree_t *mbc_tree; 3381 /* Build a tree for complex bracket. */ 3382 br_token.type = COMPLEX_BRACKET; 3383 br_token.opr.mbcset = mbcset; 3384 dfa->has_mb_node = 1; 3385 new_idx = re_dfa_add_node (dfa, br_token, 0); 3386 mbc_tree = create_tree (NULL, NULL, 0, new_idx); 3387 if (BE (new_idx == -1 || mbc_tree == NULL, 0)) 3388 goto build_word_op_espace; 3389 /* Then join them by ALT node. */ 3390 alt_token.type = OP_ALT; 3391 new_idx = re_dfa_add_node (dfa, alt_token, 0); 3392 tree = create_tree (tree, mbc_tree, 0, new_idx); 3393 if (BE (new_idx != -1 && mbc_tree != NULL, 1)) 3394 return tree; 3395 } 3396 else 3397 { 3398 free_charset (mbcset); 3399 return tree; 3400 } 3401#else /* not RE_ENABLE_I18N */ 3402 return tree; 3403#endif /* not RE_ENABLE_I18N */ 3404 3405 build_word_op_espace: 3406 re_free (sbcset); 3407#ifdef RE_ENABLE_I18N 3408 free_charset (mbcset); 3409#endif /* RE_ENABLE_I18N */ 3410 *err = REG_ESPACE; 3411 return NULL; 3412} 3413 3414/* This is intended for the expressions like "a{1,3}". 3415 Fetch a number from `input', and return the number. 3416 Return -1, if the number field is empty like "{,1}". 3417 Return -2, If an error is occured. */ 3418 3419static int 3420fetch_number (input, token, syntax) 3421 re_string_t *input; 3422 re_token_t *token; 3423 reg_syntax_t syntax; 3424{ 3425 int num = -1; 3426 unsigned char c; 3427 while (1) 3428 { 3429 *token = fetch_token (input, syntax); 3430 c = token->opr.c; 3431 if (BE (token->type == END_OF_RE, 0)) 3432 return -2; 3433 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3434 break; 3435 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) 3436 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0')); 3437 num = (num > RE_DUP_MAX) ? -2 : num; 3438 } 3439 return num; 3440} 3441 3442#ifdef RE_ENABLE_I18N 3443static void 3444free_charset (re_charset_t *cset) 3445{ 3446 re_free (cset->mbchars); 3447# ifdef _LIBC 3448 re_free (cset->coll_syms); 3449 re_free (cset->equiv_classes); 3450 re_free (cset->range_starts); 3451 re_free (cset->range_ends); 3452# endif 3453 re_free (cset->char_classes); 3454 re_free (cset); 3455} 3456#endif /* RE_ENABLE_I18N */ 3457 3458/* Functions for binary tree operation. */ 3459 3460/* Create a node of tree. 3461 Note: This function automatically free left and right if malloc fails. */ 3462 3463static bin_tree_t * 3464create_tree (left, right, type, index) 3465 bin_tree_t *left; 3466 bin_tree_t *right; 3467 re_token_type_t type; 3468 int index; 3469{ 3470 bin_tree_t *tree; 3471 tree = re_malloc (bin_tree_t, 1); 3472 if (BE (tree == NULL, 0)) 3473 { 3474 free_bin_tree (left); 3475 free_bin_tree (right); 3476 return NULL; 3477 } 3478 tree->parent = NULL; 3479 tree->left = left; 3480 tree->right = right; 3481 tree->type = type; 3482 tree->node_idx = index; 3483 tree->first = -1; 3484 tree->next = -1; 3485 re_node_set_init_empty (&tree->eclosure); 3486 3487 if (left != NULL) 3488 left->parent = tree; 3489 if (right != NULL) 3490 right->parent = tree; 3491 return tree; 3492} 3493 3494/* Free the sub tree pointed by TREE. */ 3495 3496static void 3497free_bin_tree (tree) 3498 bin_tree_t *tree; 3499{ 3500 if (tree == NULL) 3501 return; 3502 /*re_node_set_free (&tree->eclosure);*/ 3503 free_bin_tree (tree->left); 3504 free_bin_tree (tree->right); 3505 re_free (tree); 3506} 3507 3508/* Duplicate the node SRC, and return new node. */ 3509 3510static bin_tree_t * 3511duplicate_tree (src, dfa) 3512 const bin_tree_t *src; 3513 re_dfa_t *dfa; 3514{ 3515 bin_tree_t *left = NULL, *right = NULL, *new_tree; 3516 int new_node_idx; 3517 /* Since node indies must be according to Post-order of the tree, 3518 we must duplicate the left at first. */ 3519 if (src->left != NULL) 3520 { 3521 left = duplicate_tree (src->left, dfa); 3522 if (left == NULL) 3523 return NULL; 3524 } 3525 3526 /* Secondaly, duplicate the right. */ 3527 if (src->right != NULL) 3528 { 3529 right = duplicate_tree (src->right, dfa); 3530 if (right == NULL) 3531 { 3532 free_bin_tree (left); 3533 return NULL; 3534 } 3535 } 3536 3537 /* At last, duplicate itself. */ 3538 if (src->type == NON_TYPE) 3539 { 3540 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0); 3541 dfa->nodes[new_node_idx].duplicated = 1; 3542 if (BE (new_node_idx == -1, 0)) 3543 { 3544 free_bin_tree (left); 3545 free_bin_tree (right); 3546 return NULL; 3547 } 3548 } 3549 else 3550 new_node_idx = src->type; 3551 3552 new_tree = create_tree (left, right, src->type, new_node_idx); 3553 if (BE (new_tree == NULL, 0)) 3554 { 3555 free_bin_tree (left); 3556 free_bin_tree (right); 3557 } 3558 return new_tree; 3559} 3560