1/* 2 tre-parse.c - Regexp parser 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7*/ 8 9/* 10 This parser is just a simple recursive descent parser for POSIX.2 11 regexps. The parser supports both the obsolete default syntax and 12 the "extended" syntax, and some nonstandard extensions. 13*/ 14 15 16#ifdef HAVE_CONFIG_H 17#include <config.h> 18#endif /* HAVE_CONFIG_H */ 19#include <string.h> 20#include <assert.h> 21#include <limits.h> 22#include <stddef.h> 23 24#include "xmalloc.h" 25#include "tre-mem.h" 26#include "tre-ast.h" 27#include "tre-stack.h" 28#include "tre-parse.h" 29 30/* BSD compatibility: 31 Before looking up a collating symbol, check if the name matches in 32 the character names (cnames) array; if so, use the corresponding 33 character. 34 35 Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve 36 the implementation choice that for ERE, a non-numeric character following 37 a left brace that would normally be a bound, causes the left brace to be 38 literal. */ 39#define BSD_COMPATIBILITY 40#ifdef BSD_COMPATIBILITY 41#include "cname.h" 42#define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 43#endif /* BSD_COMPATIBILITY */ 44 45/* Characters with special meanings in regexp syntax. */ 46#define CHAR_PIPE L'|' 47#define CHAR_LPAREN L'(' 48#define CHAR_RPAREN L')' 49#define CHAR_LBRACE L'{' 50#define CHAR_RBRACE L'}' 51#define CHAR_LBRACKET L'[' 52#define CHAR_RBRACKET L']' 53#define CHAR_MINUS L'-' 54#define CHAR_STAR L'*' 55#define CHAR_QUESTIONMARK L'?' 56#define CHAR_PLUS L'+' 57#define CHAR_PERIOD L'.' 58#define CHAR_COLON L':' 59#define CHAR_EQUAL L'=' 60#define CHAR_COMMA L',' 61#define CHAR_CARET L'^' 62#define CHAR_DOLLAR L'$' 63#define CHAR_BACKSLASH L'\\' 64#define CHAR_HASH L'#' 65#define CHAR_TILDE L'~' 66 67 68/* Some macros for expanding \w, \s, etc. */ 69static const struct tre_macro_struct { 70 const char c; 71 const char *expansion; 72} tre_macros[] = 73 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, 74 {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, 75 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, 76 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, 77 { 0, NULL } 78 }; 79 80 81/* Expands a macro delimited by `regex' and `regex_end' to `buf', which 82 must have at least `len' items. Sets buf[0] to zero if the there 83 is no match in `tre_macros'. */ 84static void 85tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end, 86 tre_char_t *buf, size_t buf_len) 87{ 88 int i; 89 90 buf[0] = 0; 91 if (regex >= regex_end) 92 return; 93 94 for (i = 0; tre_macros[i].expansion; i++) 95 { 96 if (tre_macros[i].c == *regex) 97 { 98 unsigned int j; 99 DPRINT(("Expanding macro '%c' => '%s'\n", 100 tre_macros[i].c, tre_macros[i].expansion)); 101 for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++) 102 buf[j] = tre_macros[i].expansion[j]; 103 buf[j] = 0; 104 break; 105 } 106 } 107} 108 109static reg_errcode_t 110tre_new_item(tre_mem_t mem, int type, int val, int *max_i, 111 tre_bracket_match_list_t **items) 112{ 113 reg_errcode_t status = REG_OK; 114 tre_bracket_match_list_t *array = *items; 115 int i = array->num_bracket_matches; 116 /* Allocate more space if necessary. */ 117 if (i >= *max_i) 118 { 119 tre_bracket_match_list_t *new_items; 120 DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i)); 121 /* If the array is already 1024 items large, give up -- there's 122 probably an error in the regexp (e.g. not a '\0' terminated 123 string and missing ']') */ 124 if (*max_i >= 1024) 125 return REG_ESPACE; 126 *max_i *= 2; 127 new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i)); 128 if (new_items == NULL) 129 return REG_ESPACE; 130 *items = array = new_items; 131 } 132 array->bracket_matches[i].type = type; 133 array->bracket_matches[i].value = val; 134 array->num_bracket_matches++; 135 return status; 136} 137 138#ifndef TRE_USE_SYSTEM_WCTYPE 139 140/* isalnum() and the rest may be macros, so wrap them to functions. */ 141int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); } 142int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); } 143 144#ifdef tre_isascii 145int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); } 146#else /* !tre_isascii */ 147int tre_isascii_func(tre_cint_t c) { return !(c >> 7); } 148#endif /* !tre_isascii */ 149 150#ifdef tre_isblank 151int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); } 152#else /* !tre_isblank */ 153int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); } 154#endif /* !tre_isblank */ 155 156int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); } 157int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); } 158int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); } 159int tre_islower_func(tre_cint_t c) { return tre_islower(c); } 160int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); } 161int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); } 162int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); } 163int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); } 164int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); } 165 166struct { 167 char *name; 168 int (*func)(tre_cint_t); 169} tre_ctype_map[] = { 170 { "alnum", &tre_isalnum_func }, 171 { "alpha", &tre_isalpha_func }, 172#ifdef tre_isascii 173 { "ascii", &tre_isascii_func }, 174#endif /* tre_isascii */ 175#ifdef tre_isblank 176 { "blank", &tre_isblank_func }, 177#endif /* tre_isblank */ 178 { "cntrl", &tre_iscntrl_func }, 179 { "digit", &tre_isdigit_func }, 180 { "graph", &tre_isgraph_func }, 181 { "lower", &tre_islower_func }, 182 { "print", &tre_isprint_func }, 183 { "punct", &tre_ispunct_func }, 184 { "space", &tre_isspace_func }, 185 { "upper", &tre_isupper_func }, 186 { "xdigit", &tre_isxdigit_func }, 187 { NULL, NULL} 188}; 189 190tre_ctype_t tre_ctype(const char *name) 191{ 192 int i; 193 for (i = 0; tre_ctype_map[i].name != NULL; i++) 194 { 195 if (strcmp(name, tre_ctype_map[i].name) == 0) 196 return tre_ctype_map[i].func; 197 } 198 return (tre_ctype_t)0; 199} 200#endif /* !TRE_USE_SYSTEM_WCTYPE */ 201 202#define REST(re) (int)(ctx->re_end - (re)), (re) 203 204#define START_COLLATING_SYMBOLS 16 205#define MAX_COLLATING_SYMBOL_LEN 4 206 207typedef struct { 208 const tre_char_t *start; 209 int len; 210} tre_collating_symbol; 211 212#include <xlocale.h> 213 214int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len); 215 216#ifdef BSD_COMPATIBILITY 217static wchar_t 218tre_search_cnames(const wchar_t *name, size_t len) 219{ 220 size_t low = 0; 221 size_t high = NCNAMES - 1; 222 size_t cur; 223 int cmp; 224 225 while(low <= high) 226 { 227 cur = (low + high) / 2; 228 cmp = wcsncmp(name, cnames[cur].name, len); 229 if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code; 230 if (cmp > 0) low = cur + 1; 231 else high = cur - 1; 232 } 233 return (wchar_t)-1; 234} 235#endif /* BSD_COMPATIBILITY */ 236 237/* Scan the contents of a bracket expression, and create a 238 * tre_bracket_match_list_t encoding the bracket expression. If during 239 * the scan, multi-character collating symbols are detected, switch 240 * into a mode to collect those MCCSs into a tre_collating_symbol 241 * list and pass them back. tre_parse_bracket will use that to 242 * create a new string composed of a union of the bracket expression 243 * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and 244 * call tre_parse (recursive) to parse that new string (which will 245 * call tre_parse_bracket and tre_parse_bracket_items again. */ 246static reg_errcode_t 247tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items, 248 int *items_size, tre_collating_symbol **result) 249{ 250 const tre_char_t *re = ctx->re; 251 const tre_char_t *re_end = ctx->re_end; 252 tre_collating_symbol *col_syms = NULL; 253 tre_collating_symbol *cp = NULL; 254 int n_col_syms = 0; 255 reg_errcode_t status; 256 int max_i = *items_size; 257 int other = 0; /* contains content other than multi-character collating 258 * symbols */ 259 int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */ 260 tre_cint_t min, c; 261 int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE); 262 int collect_MCCS = 0; 263 const tre_char_t *start; 264 265 for ( ;re < re_end; re++) 266 { 267 switch (*re) 268 { 269 case CHAR_MINUS: 270 /* A first hyphen */ 271 if (re == ctx->re) 272 { 273 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 274 min = CHAR_MINUS; 275 other++; 276 range = 0; 277 break; 278 } 279 /* The hyphen is the end range */ 280 if (range > 0) 281 { 282 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 283 c = CHAR_MINUS; 284 goto process_end_range; 285 } 286 if (re + 1 >= re_end) 287 { 288 status = REG_EBRACK; 289 goto error; 290 } 291 /* The hyphen is at the end */ 292 if (re[1] == CHAR_RBRACKET) 293 { 294 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 295 c = CHAR_MINUS; 296 goto process_begin_range; 297 } 298 /* Two ranges are not allowed to share an endpoint, or begin 299 * range is illegal. */ 300 if (range < 0) 301 { 302 status = REG_ERANGE; 303 goto error; 304 } 305 range = 1; /* Expect end range */ 306 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re))); 307 break; 308 309 case CHAR_LBRACKET: 310 if (re + 1 >= re_end) 311 { 312 status = REG_EBRACK; 313 goto error; 314 } 315 switch (re[1]) 316 { 317 case CHAR_PERIOD: 318 { 319 re += 2; 320 start = re; 321 for (;; re++) 322 { 323 if (re >= re_end) 324 { 325 status = REG_ECOLLATE; 326 goto error; 327 } 328 if (*re == CHAR_PERIOD) 329 { 330 if (re + 1 >= re_end) 331 { 332 status = REG_ECOLLATE; 333 goto error; 334 } 335 /* Found end */ 336 if (re[1] == CHAR_RBRACKET) 337 { 338 DPRINT(("tre_parse_bracket: collating " 339 "symbol: '%.*" STRF "'\n", 340 REST(start - 2))); 341 /* Empty name */ 342 if (re == start) 343 { 344 status = REG_ECOLLATE; 345 goto error; 346 } 347#ifdef BSD_COMPATIBILITY 348 /* Check if the name is in cnames; if so, use 349 the corresponding code */ 350 c = tre_search_cnames(start, re - start); 351 if (c != (wchar_t)-1) 352 { 353 re++; 354 goto process_single_character; 355 } 356#endif /* BSD_COMPATIBILITY */ 357 /* Verify this is a known sequence */ 358 if (__collate_equiv_value(ctx->loc, start, 359 re - start) <= 0) 360 { 361 status = REG_ECOLLATE; 362 goto error; 363 } 364 /* Process single character collating symbols */ 365 if (re - start == 1) 366 { 367 c = *start; 368 re++; 369 goto process_single_character; 370 } 371 /* Inverted MCCSs are undefined */ 372 if (invert) 373 { 374 status = REG_ECOLLATE; 375 goto error; 376 } 377 /* Can't have MCCSs as an endpoint to a range */ 378 if (range > 0) 379 { 380 status = REG_ERANGE; 381 goto error; 382 } 383 range = -1; 384 /* Switch into MCCS collection mode (if not 385 * already there */ 386#if TRE_DEBUG 387 if (!collect_MCCS) 388 { 389 collect_MCCS = 1; 390 DPRINT(("tre_parse_bracket: Detected MCCS\n")); 391 } 392#else /* !TRE_DEBUG */ 393 collect_MCCS = 1; 394#endif /* !TRE_DEBUG */ 395 /* Allocate a memory block the first time */ 396 if (!cp) 397 { 398 if ((col_syms = xmalloc(sizeof(*col_syms) * 399 (START_COLLATING_SYMBOLS + 2))) 400 == NULL) 401 return REG_ESPACE; 402 cp = col_syms + 1; 403 n_col_syms = START_COLLATING_SYMBOLS; 404 } 405 /* Enlarge the memory block is more is needed */ 406 if ((cp - col_syms) - 1 >= n_col_syms) 407 { 408 int i = n_col_syms; 409 tre_collating_symbol *tmp = 410 xrealloc(col_syms, sizeof(*col_syms) * 411 ((n_col_syms *= 2) + 2)); 412 if (tmp == NULL) 413 { 414 xfree(col_syms); 415 return REG_ESPACE; 416 } 417 DPRINT(("tre_list_collating_symbols: " 418 "Enlarging col_syms to %d\n", 419 n_col_syms)); 420 col_syms = tmp; 421 cp = col_syms + i + 1; 422 } 423 cp->start = start; 424 cp->len = re - start; 425 cp++; 426 re++; 427 break; 428 } 429 } 430 } 431 break; 432 } 433 434 case CHAR_EQUAL: 435 case CHAR_COLON: 436 { 437 /* Process equivalence and character classes */ 438 tre_char_t kind = re[1]; 439 440 /* Can't have a class as an endpoint to a range */ 441 if (range > 0) 442 { 443 status = REG_ERANGE; 444 goto error; 445 } 446 if (!collect_MCCS && range == 0) 447 { 448 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, 449 min, &max_i, items); 450 if (status != REG_OK) 451 goto error; 452 } 453 range = -1; 454 re += 2; 455 start = re; 456 for (;; re++) 457 { 458 if (re >= re_end) 459 { 460 status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE; 461 goto error; 462 } 463 if (*re == kind) 464 { 465 if (re + 1 >= re_end) 466 { 467 status = kind == CHAR_EQUAL ? REG_ECOLLATE : 468 REG_ECTYPE; 469 goto error; 470 } 471 /* Found end */ 472 if (re[1] == CHAR_RBRACKET) 473 { 474 if (re == start) 475 { 476 /* Empty class name */ 477 status = kind == CHAR_EQUAL ? REG_ECOLLATE : 478 REG_ECTYPE; 479 goto error; 480 } 481 /* Process equivalence class */ 482 if (kind == CHAR_EQUAL) 483 { 484 int equiv; 485 486 DPRINT(("tre_parse_bracket: equivalence: '%.*" 487 STRF "'\n", REST(start - 2))); 488 489 /* While we find the collation value even for 490 multi-character collating elements , we 491 don't (yet) match any collation values 492 against multi-character sequences. We'd have 493 to enumerate those multi-character sequences 494 and like multi-character collating symbols, 495 create a union of those sequences with the 496 rest of the bracket expression. While 497 doable, a bracket expression matching 498 multiple characters, that doesn't explicitly 499 contain multi-character sequences, might 500 be unexpected, so we punt for now. */ 501 if ((equiv = __collate_equiv_value(ctx->loc, 502 start, re - start)) <= 0) 503 { 504 /* The standard says that if no collating 505 element if found, we use the collating 506 symbol itself. But __collate_equiv_value 507 doesn't make a distinction between 508 an element that is in a equvalence 509 class with others, or is the only member, 510 so we already know there is no collating 511 symbol. (Note that in the case of a 512 collating element whose collation value 513 is unique, matching against the 514 collating element itself, or against 515 its collation value, is equivalent.) */ 516#ifdef BSD_COMPATIBILITY 517 /* Check if the name is in cnames; if so, 518 use the corresponding code */ 519 c = tre_search_cnames(start, re - start); 520 if (c != (wchar_t)-1) 521 { 522 re++; 523 goto process_single_character; 524 } 525#endif /* BSD_COMPATIBILITY */ 526 status = REG_ECOLLATE; 527 goto error; 528 } 529 if (!collect_MCCS) 530 { 531 status = tre_new_item(ctx->mem, 532 TRE_BRACKET_MATCH_TYPE_EQUIVALENCE, 533 equiv, &max_i, items); 534 if (status != REG_OK) 535 goto error; 536 } 537 } 538 else 539 { 540 /* Process character class */ 541 DPRINT(("tre_parse_bracket: class: '%.*" STRF 542 "'\n", REST(start - 2))); 543 if (!collect_MCCS) 544 { 545 char tmp_str[64]; 546 tre_ctype_t class; 547 int len = MIN(re - start, 63); 548#ifdef TRE_WCHAR 549 { 550 tre_char_t tmp_wcs[64]; 551 wcsncpy(tmp_wcs, start, (size_t)len); 552 tmp_wcs[len] = L'\0'; 553#if defined HAVE_WCSRTOMBS 554 { 555 mbstate_t state; 556 const tre_char_t *src = tmp_wcs; 557 memset(&state, '\0', sizeof(state)); 558 len = wcsrtombs_l(tmp_str, &src, 559 sizeof(tmp_str), &state, 560 ctx->loc); 561 } 562#elif defined HAVE_WCSTOMBS 563 len = wcstombs(tmp_str, tmp_wcs, 63); 564#endif /* defined HAVE_WCSTOMBS */ 565 } 566#else /* !TRE_WCHAR */ 567 strncpy(tmp_str, (const char*)start, len); 568#endif /* !TRE_WCHAR */ 569 tmp_str[len] = '\0'; 570 DPRINT((" class name: %s\n", tmp_str)); 571 class = tre_ctype_l(tmp_str, ctx->loc); 572 if (!class) 573 { 574 status = REG_ECTYPE; 575 goto error; 576 } 577 status = tre_new_item(ctx->mem, 578 TRE_BRACKET_MATCH_TYPE_CLASS, 579 class, &max_i, items); 580 if (status != REG_OK) 581 goto error; 582 } 583 } 584 re++; 585 break; 586 } 587 } 588 } 589 other++; 590 break; 591 } 592 593 default: 594 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 595 c = CHAR_LBRACKET; 596 goto process_single_character; 597 break; 598 } 599 break; 600 601 case CHAR_RBRACKET: 602 /* A first right bracket */ 603 if (re == ctx->re) 604 { 605 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 606 min = CHAR_RBRACKET; 607 range = 0; 608 other++; 609 break; 610 } 611 /* Done */ 612 if (collect_MCCS) 613 { 614 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", 615 REST(re))); 616 if (col_syms) 617 { 618 /* Mark the character following the right bracket. Set len 619 * to whether there are other things besides the 620 * multi-character collating symbols */ 621 col_syms->start = re + 1; 622 col_syms->len = other; 623 /* Mark the end of the list */ 624 cp->start = NULL; 625 } 626 *result = col_syms; 627 return REG_OK; 628 } 629 /* range > 0 is not possible, since we did a lookahead after the 630 * hyphen */ 631 if (range == 0) 632 { 633 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, 634 min, &max_i, items); 635 if (status != REG_OK) 636 goto error; 637 } 638 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); 639 *items_size = max_i; 640 ctx->re = re + 1; 641 return REG_OK; 642 643 default: 644 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 645 c = *re; 646process_single_character: 647 /* Process single character */ 648 if (range > 0) 649 { 650 int mine, maxe; 651 652process_end_range: 653 /* Get collation equivalence values */ 654 mine = __collate_equiv_value(ctx->loc, &min, 1); 655 maxe = __collate_equiv_value(ctx->loc, &c, 1); 656 if (maxe < mine) 657 { 658 status = REG_ERANGE; 659 goto error; 660 } 661 if (!collect_MCCS) 662 { 663 status = tre_new_item(ctx->mem, 664 TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN, 665 mine, &max_i, items); 666 if (status != REG_OK) 667 goto error; 668 status = tre_new_item(ctx->mem, 669 TRE_BRACKET_MATCH_TYPE_RANGE_END, 670 maxe, &max_i, items); 671 if (status != REG_OK) 672 goto error; 673 } 674 range = -1; 675 } 676 else 677 { 678process_begin_range: 679 if (!collect_MCCS) 680 { 681 if (range == 0) 682 { 683 status = tre_new_item(ctx->mem, 684 TRE_BRACKET_MATCH_TYPE_CHAR, 685 min, &max_i, items); 686 if (status != REG_OK) 687 goto error; 688 } 689 min = c; 690 } 691 range = 0; 692 } 693 other++; 694 break; 695 } 696 } 697 status = REG_EBRACK; 698error: 699 DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n", 700 REST(re), status)); 701 if (col_syms) 702 xfree(col_syms); 703 return status; 704} 705 706#ifdef TRE_DEBUG 707static const char *bracket_match_type_str[] = { 708 "unused", 709 "char", 710 "range begin", 711 "range end", 712 "class", 713 "equivalence value", 714}; 715#endif /* TRE_DEBUG */ 716 717static reg_errcode_t 718tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) 719{ 720 tre_ast_node_t *node; 721 reg_errcode_t status = REG_OK; 722 tre_bracket_match_list_t *items; 723 int max_i = 32; 724 tre_collating_symbol *col_syms = NULL; 725 726 /* Handle special cases [[:<:]] and [[:>:]] */ 727 if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET 728 && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>') 729 && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET 730 && ctx->re[5] == CHAR_RBRACKET) 731 { 732 *result = tre_ast_new_literal(ctx->mem, ASSERTION, 733 (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW, 734 -1); 735 DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ? 736 "[[:<:]]" : "[[:>:]]")); 737 ctx->re += 6; 738 return *result ? REG_OK : REG_ESPACE; 739 } 740 741 /* Start off with an array of `max_i' elements. */ 742 items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i)); 743 if (items == NULL) 744 return REG_ESPACE; 745 746 if (*ctx->re == CHAR_CARET) 747 { 748 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re))); 749 items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE; 750 ctx->re++; 751 } 752 753 status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms); 754 755 if (status != REG_OK) 756 goto parse_bracket_done; 757 758 /* If there are collating symbols, split off the multi-character ones 759 * into a union of the bracket expression (without the collating symbols) 760 * and the multiple-character sequences. We create an equivalent input 761 * string and run tre_parse() recursively */ 762 if (col_syms) 763 { 764 tre_char_t *str, *sp; 765 tre_collating_symbol *cp; 766 tre_parse_ctx_t subctx; 767 768 /* Allocate a new string. We start with the size of the original 769 * bracket expression (minus 1) and add 2 (for a leading "[" and 770 * a trailing nil; don't need a "^", since it is illegal to have 771 * inverted MCCSs). Since a multi-character collating symbols 772 * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't 773 * need to worry about the new string getting too long. */ 774 xfree(items); 775 str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2)); 776 if (str == NULL) 777 { 778 xfree(col_syms); 779 return REG_ESPACE; 780 } 781 sp = str; 782 if (col_syms->len > 0) 783 { 784 /* There are other items in the bracket expression besides the 785 * multi-character collating symbols, so create a new bracket 786 * expression with only those other itmes. */ 787 const tre_char_t *re; 788 ptrdiff_t i; 789 790 *sp++ = '['; 791 re = ctx->re; 792 for (cp = col_syms + 1; cp->start; cp++) 793 { 794 /* The "- 2" is to account for the "[." */ 795 if ((i = ((cp->start - re) - 2)) > 0) 796 { 797 memcpy(sp, re, sizeof(*sp) * i); 798 sp += i; 799 } 800 /* The "+ 2" is to account for the ".]" */ 801 re = cp->start + cp->len + 2; 802 } 803 i = col_syms->start - re; /* Includes the trailing right bracket */ 804 memcpy(sp, re, sizeof(*sp) * i); 805 sp += i; 806 *sp++ = '|'; 807 } 808 for (cp = col_syms + 1; cp->start; cp++) 809 { 810 memcpy(sp, cp->start, sizeof(*sp) * cp->len); 811 sp += cp->len; 812 if (cp[1].start) 813 *sp++ = '|'; 814 } 815 *sp = 0; 816 DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n", 817 str)); 818 819 memcpy(&subctx, ctx, sizeof(subctx)); 820 subctx.re = str; 821 subctx.len = sp - str; 822 subctx.nofirstsub = 1; 823 subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */ 824 status = tre_parse(&subctx); 825 xfree(str); 826 if (status != REG_OK) 827 { 828 xfree(col_syms); 829 return status; 830 } 831 ctx->re = col_syms->start; 832 ctx->position = subctx.position; 833 xfree(col_syms); 834 *result = subctx.result; 835 DPRINT(("tre_parse_bracket: Returning to original string\n")); 836 return REG_OK; 837 } 838 839 DPRINT(("tre_parse_bracket: creating bracket expression literal\n")); 840 node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position); 841 if (node == NULL) 842 { 843 status = REG_ESPACE; 844 goto parse_bracket_done; 845 } 846 else 847 { 848 tre_literal_t *l = node->obj; 849 l->u.bracket_match_list = tre_mem_alloc(ctx->mem, 850 SIZEOF_BRACKET_MATCH_LIST(items)); 851 if (l->u.bracket_match_list == NULL) 852 { 853 status = REG_ESPACE; 854 goto parse_bracket_done; 855 } 856 memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items)); 857 } 858 859#ifdef TRE_DEBUG 860 { 861 int i; 862 tre_bracket_match_t *b; 863 DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n", 864 items->num_bracket_matches, items->flags)); 865 for (i = 0, b = items->bracket_matches; 866 i < items->num_bracket_matches; i++, b++) 867 { 868 DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type], 869 b->value)); 870 } 871 } 872#endif /* TRE_DEBUG */ 873 874 parse_bracket_done: 875 xfree(items); 876 ctx->position++; 877 *result = node; 878 return status; 879} 880 881 882/* Parses a positive decimal integer. Returns -1 if the string does not 883 contain a valid number. */ 884static int 885tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) 886{ 887 int num = -1; 888 const tre_char_t *r = *regex; 889 while (r < regex_end && *r >= L'0' && *r <= L'9') 890 { 891 if (num < 0) 892 num = 0; 893 num = num * 10 + *r - L'0'; 894 r++; 895 } 896 *regex = r; 897 return num; 898} 899 900 901static reg_errcode_t 902tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) 903{ 904 int min, max; 905#ifdef TRE_APPROX 906 int i; 907 int cost_ins, cost_del, cost_subst, cost_max; 908 int limit_ins, limit_del, limit_subst, limit_err; 909 const tre_char_t *start; 910#endif /* TRE_APPROX */ 911 const tre_char_t *r = ctx->re; 912 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; 913#ifdef TRE_APPROX 914 int approx = 0; 915 int costs_set = 0; 916 int counts_set = 0; 917 918 cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET; 919 limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET; 920#endif /* TRE_APPROX */ 921 922 /* Parse number (minimum repetition count). */ 923 min = -1; 924 if (r >= ctx->re_end) 925#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 926 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE; 927#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 928 return REG_EBRACE; 929#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 930 if (*r >= L'0' && *r <= L'9') { 931 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r))); 932 min = tre_parse_int(&r, ctx->re_end); 933 } 934#ifndef TRE_APPROX 935 else 936#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 937 /* For ERE, return REG_NOMATCH to signal that the lbrace should 938 be treated as a literal */ 939 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR; 940#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 941 return REG_BADBR; 942#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 943#endif /* !TRE_APPROX */ 944 945 /* Parse comma and second number (maximum repetition count). */ 946 max = min; 947 if (r < ctx->re_end && *r == CHAR_COMMA) 948 { 949 r++; 950 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r))); 951 max = tre_parse_int(&r, ctx->re_end); 952 } 953 954 /* Check that the repeat counts are sane. */ 955 if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX) 956 return REG_BADBR; 957 958 959#ifdef TRE_APPROX 960 /* 961 '{' 962 optionally followed immediately by a number == minimum repcount 963 optionally followed by , then a number == maximum repcount 964 + then a number == maximum insertion count 965 - then a number == maximum deletion count 966 # then a number == maximum substitution count 967 ~ then a number == maximum number of errors 968 Any of +, -, # or ~ without followed by a number means that 969 the maximum count/number of errors is infinite. 970 971 An equation of the form 972 Xi + Yd + Zs < C 973 can be specified to set costs and the cost limit to a value 974 different from the default value: 975 - X is the cost of an insertion 976 - Y is the cost of a deletion 977 - Z is the cost of a substitution 978 - C is the maximum cost 979 980 If no count limit or cost is set for an operation, the operation 981 is not allowed at all. 982 */ 983 984 985 do { 986 int done; 987 start = r; 988 989 /* Parse count limit settings */ 990 done = 0; 991 if (!counts_set) 992 while (r + 1 < ctx->re_end && !done) 993 { 994 switch (*r) 995 { 996 case CHAR_PLUS: /* Insert limit */ 997 DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r))); 998 r++; 999 limit_ins = tre_parse_int(&r, ctx->re_end); 1000 if (limit_ins < 0) 1001 limit_ins = INT_MAX; 1002 counts_set = 1; 1003 break; 1004 case CHAR_MINUS: /* Delete limit */ 1005 DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r))); 1006 r++; 1007 limit_del = tre_parse_int(&r, ctx->re_end); 1008 if (limit_del < 0) 1009 limit_del = INT_MAX; 1010 counts_set = 1; 1011 break; 1012 case CHAR_HASH: /* Substitute limit */ 1013 DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r))); 1014 r++; 1015 limit_subst = tre_parse_int(&r, ctx->re_end); 1016 if (limit_subst < 0) 1017 limit_subst = INT_MAX; 1018 counts_set = 1; 1019 break; 1020 case CHAR_TILDE: /* Maximum number of changes */ 1021 DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r))); 1022 r++; 1023 limit_err = tre_parse_int(&r, ctx->re_end); 1024 if (limit_err < 0) 1025 limit_err = INT_MAX; 1026 approx = 1; 1027 break; 1028 case CHAR_COMMA: 1029 r++; 1030 break; 1031 case L' ': 1032 r++; 1033 break; 1034 case L'}': 1035 done = 1; 1036 break; 1037 default: 1038 done = 1; 1039 break; 1040 } 1041 } 1042 1043 /* Parse cost restriction equation. */ 1044 done = 0; 1045 if (!costs_set) 1046 while (r + 1 < ctx->re_end && !done) 1047 { 1048 switch (*r) 1049 { 1050 case CHAR_PLUS: 1051 case L' ': 1052 r++; 1053 break; 1054 case L'<': 1055 DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r))); 1056 r++; 1057 while (*r == L' ') 1058 r++; 1059 cost_max = tre_parse_int(&r, ctx->re_end); 1060 if (cost_max < 0) 1061 cost_max = INT_MAX; 1062 else 1063 cost_max--; 1064 approx = 1; 1065 break; 1066 case CHAR_COMMA: 1067 r++; 1068 done = 1; 1069 break; 1070 default: 1071 if (*r >= L'0' && *r <= L'9') 1072 { 1073#ifdef TRE_DEBUG 1074 const tre_char_t *sr = r; 1075#endif /* TRE_DEBUG */ 1076 int cost = tre_parse_int(&r, ctx->re_end); 1077 /* XXX - make sure r is not past end. */ 1078 switch (*r) 1079 { 1080 case L'i': /* Insert cost */ 1081 DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n", 1082 REST(sr))); 1083 r++; 1084 cost_ins = cost; 1085 costs_set = 1; 1086 break; 1087 case L'd': /* Delete cost */ 1088 DPRINT(("tre_parse: del cost: '%.*" STRF "'\n", 1089 REST(sr))); 1090 r++; 1091 cost_del = cost; 1092 costs_set = 1; 1093 break; 1094 case L's': /* Substitute cost */ 1095 DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n", 1096 REST(sr))); 1097 r++; 1098 cost_subst = cost; 1099 costs_set = 1; 1100 break; 1101 default: 1102 return REG_BADBR; 1103 } 1104 } 1105 else 1106 { 1107 done = 1; 1108 break; 1109 } 1110 } 1111 } 1112 } while (start != r); 1113#endif /* TRE_APPROX */ 1114 1115 /*{*//* Missing }. */ 1116 if (r >= ctx->re_end) 1117 return REG_EBRACE; 1118 1119 /* Empty contents of {}. */ 1120 if (r == ctx->re) 1121 return REG_BADBR; 1122 1123 /* Parse the ending '}' or '\}'.*/ 1124 if (ctx->cflags & REG_EXTENDED) 1125 { 1126 if (r >= ctx->re_end || *r != CHAR_RBRACE) 1127 return REG_BADBR; 1128 r++; 1129 /* Parse trailing '?' marking minimal repetition. */ 1130 if (r < ctx->re_end) 1131 { 1132 if (*r == CHAR_QUESTIONMARK) 1133 { 1134 /* Process the question mark only in enhanced mode. 1135 Otherwise, the question mark is an error in ERE 1136 or a literal in BRE */ 1137 if (ctx->cflags & REG_ENHANCED) 1138 { 1139 minimal = !(ctx->cflags & REG_UNGREEDY); 1140 r++; 1141 } 1142 else return REG_BADRPT; 1143 } 1144 else if (*r == CHAR_STAR || *r == CHAR_PLUS) 1145 { 1146 /* These are reserved for future extensions. */ 1147 return REG_BADRPT; 1148 } 1149 } 1150 } 1151 else 1152 { 1153 if (r + 1 >= ctx->re_end 1154 || *r != CHAR_BACKSLASH 1155 || *(r + 1) != CHAR_RBRACE) 1156 return REG_BADBR; 1157 r += 2; 1158 if (r < ctx->re_end && *r == CHAR_STAR) 1159 { 1160 /* This is reserved for future extensions. */ 1161 return REG_BADRPT; 1162 } 1163 } 1164 1165 if (minimal) 1166 ctx->num_reorder_tags++; 1167 1168 if (!result) goto parse_bound_exit; 1169 /* Create the AST node(s). */ 1170 /* Originally, if min == 0 && max == 0, we immediately replace the whole 1171 iteration with EMPTY. This unfortunately drops any submatches, and 1172 messes up setting the pmatch values (we can get tags of -1, and 1173 tag values in the billions). So we leave it and process this case as 1174 usual, and wait until tre_expand_ast() to replace with EMPTY */ 1175#ifdef TRE_APPROX 1176 if (min < 0 && max < 0) 1177 /* Only approximate parameters set, no repetitions. */ 1178 min = max = 1; 1179#endif /* TRE_APPROX */ 1180 1181 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); 1182 if (!*result) 1183 return REG_ESPACE; 1184 1185#ifdef TRE_APPROX 1186 /* If approximate matching parameters are set, add them to the 1187 iteration node. */ 1188 if (approx || costs_set || counts_set) 1189 { 1190 int *params; 1191 tre_iteration_t *iter = (*result)->obj; 1192 1193 if (costs_set || counts_set) 1194 { 1195 if (limit_ins == TRE_PARAM_UNSET) 1196 { 1197 if (cost_ins == TRE_PARAM_UNSET) 1198 limit_ins = 0; 1199 else 1200 limit_ins = INT_MAX; 1201 } 1202 1203 if (limit_del == TRE_PARAM_UNSET) 1204 { 1205 if (cost_del == TRE_PARAM_UNSET) 1206 limit_del = 0; 1207 else 1208 limit_del = INT_MAX; 1209 } 1210 1211 if (limit_subst == TRE_PARAM_UNSET) 1212 { 1213 if (cost_subst == TRE_PARAM_UNSET) 1214 limit_subst = 0; 1215 else 1216 limit_subst = INT_MAX; 1217 } 1218 } 1219 1220 if (cost_max == TRE_PARAM_UNSET) 1221 cost_max = INT_MAX; 1222 if (limit_err == TRE_PARAM_UNSET) 1223 limit_err = INT_MAX; 1224 1225 ctx->have_approx = 1; 1226 params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST); 1227 if (!params) 1228 return REG_ESPACE; 1229 for (i = 0; i < TRE_PARAM_LAST; i++) 1230 params[i] = TRE_PARAM_UNSET; 1231 params[TRE_PARAM_COST_INS] = cost_ins; 1232 params[TRE_PARAM_COST_DEL] = cost_del; 1233 params[TRE_PARAM_COST_SUBST] = cost_subst; 1234 params[TRE_PARAM_COST_MAX] = cost_max; 1235 params[TRE_PARAM_MAX_INS] = limit_ins; 1236 params[TRE_PARAM_MAX_DEL] = limit_del; 1237 params[TRE_PARAM_MAX_SUBST] = limit_subst; 1238 params[TRE_PARAM_MAX_ERR] = limit_err; 1239 iter->params = params; 1240 } 1241#endif /* TRE_APPROX */ 1242 1243parse_bound_exit: 1244#ifdef TRE_APPROX 1245 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], " 1246 "limits [%d,%d,%d, total %d]\n", 1247 min, max, cost_ins, cost_del, cost_subst, cost_max, 1248 limit_ins, limit_del, limit_subst, limit_err)); 1249#else /* !TRE_APPROX */ 1250 DPRINT(("tre_parse_bound: min %d, max %d\n", min, max)); 1251#endif /* !TRE_APPROX */ 1252 1253 1254 ctx->re = r; 1255 return REG_OK; 1256} 1257 1258/* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for 1259 non-self-contained options, like (?i), this causes ((?i)fu)bar to be 1260 treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect. 1261 Because we now set up tags for even non-capturing parenthesized 1262 subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we 1263 pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and 1264 have it restore cflags after the subexpression, we don't need to have 1265 a separate PARSE_RESTORE_CFLAGS, and then after processing the 1266 non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE. 1267 This has the side-benefit of now matching the perl behavior: the RE 1268 foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior 1269 of foo AND (?i) (bar OR zap). */ 1270typedef enum { 1271 PARSE_RE = 0, 1272 PARSE_ATOM, 1273 PARSE_MARK_FOR_SUBMATCH, 1274 PARSE_BRANCH, 1275 PARSE_PIECE, 1276 PARSE_CATENATION, 1277 PARSE_POST_CATENATION, 1278 PARSE_UNION, 1279 PARSE_POST_UNION, 1280 PARSE_POSTFIX, 1281} tre_parse_re_stack_symbol_t; 1282 1283 1284reg_errcode_t 1285tre_parse(tre_parse_ctx_t *ctx) 1286{ 1287 tre_ast_node_t *result = NULL; 1288 tre_parse_re_stack_symbol_t symbol; 1289 reg_errcode_t status = REG_OK; 1290 tre_stack_t *stack = ctx->stack; 1291 int bottom = tre_stack_num_objects(stack); 1292 int depth = 0; 1293 int temporary_cflags = 0; 1294 int bre_branch_begin; 1295#ifdef TRE_DEBUG 1296 const tre_char_t *tmp_re; 1297#endif 1298 1299 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n", 1300 ctx->len, ctx->re, ctx->len, ctx->cflags)); 1301 1302 if (ctx->len <= 0) return REG_EMPTY; 1303 if (!ctx->nofirstsub) 1304 { 1305 STACK_PUSH(stack, int, ctx->cflags); 1306 STACK_PUSH(stack, int, ctx->submatch_id); 1307 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); 1308 ctx->submatch_id++; 1309 } 1310 STACK_PUSH(stack, int, 0); // bre_branch_begin 1311 STACK_PUSH(stack, int, PARSE_RE); 1312 ctx->re_start = ctx->re; 1313 ctx->re_end = ctx->re + ctx->len; 1314 1315 1316 /* The following is basically just a recursive descent parser. I use 1317 an explicit stack instead of recursive functions mostly because of 1318 two reasons: compatibility with systems which have an overflowable 1319 call stack, and efficiency (both in lines of code and speed). */ 1320 while (tre_stack_num_objects(stack) > bottom) 1321 { 1322 symbol = tre_stack_pop_int(stack); 1323 switch (symbol) 1324 { 1325 case PARSE_RE: 1326 /* Parse a full regexp. A regexp is one or more branches, 1327 separated by the union operator `|'. */ 1328 bre_branch_begin = tre_stack_pop_int(stack); 1329 if ( 1330#ifdef REG_LITERAL 1331 !(ctx->cflags & REG_LITERAL) && 1332#endif /* REG_LITERAL */ 1333 ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) 1334 STACK_PUSHX(stack, int, PARSE_UNION); 1335 STACK_PUSHX(stack, int, bre_branch_begin); 1336 STACK_PUSHX(stack, int, PARSE_BRANCH); 1337 break; 1338 1339 case PARSE_BRANCH: 1340 /* Parse a branch. A branch is one or more pieces, concatenated. 1341 A piece is an atom possibly followed by a postfix operator. */ 1342 bre_branch_begin = tre_stack_pop_int(stack); 1343 STACK_PUSHX(stack, int, PARSE_CATENATION); 1344 STACK_PUSHX(stack, int, bre_branch_begin); 1345 STACK_PUSHX(stack, int, PARSE_PIECE); 1346 break; 1347 1348 case PARSE_PIECE: 1349 /* Parse a piece. A piece is an atom possibly followed by one 1350 or more postfix operators. */ 1351 bre_branch_begin = tre_stack_pop_int(stack); 1352 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1353 STACK_PUSHX(stack, int, bre_branch_begin); 1354 STACK_PUSHX(stack, int, PARSE_ATOM); 1355 break; 1356 1357 case PARSE_CATENATION: 1358 /* If the expression has not ended, parse another piece. */ 1359 { 1360 tre_char_t c; 1361 if (ctx->re >= ctx->re_end) 1362 break; 1363 c = *ctx->re; 1364#ifdef REG_LITERAL 1365 if (!(ctx->cflags & REG_LITERAL)) 1366 { 1367#endif /* REG_LITERAL */ 1368 if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) || 1369 ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED 1370 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH && 1371 *(ctx->re + 1) == CHAR_PIPE)) 1372 break; 1373 if ((ctx->cflags & REG_EXTENDED 1374 && c == CHAR_RPAREN && depth > 0) 1375 || (!(ctx->cflags & REG_EXTENDED) 1376 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH 1377 && *(ctx->re + 1) == CHAR_RPAREN)) 1378 { 1379 if (!(ctx->cflags & REG_EXTENDED) && depth == 0) 1380 return REG_EPAREN; 1381 DPRINT(("tre_parse: group end: '%.*" STRF "'\n", 1382 REST(ctx->re))); 1383 depth--; 1384 if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED))) 1385 ctx->re += 2; 1386 break; 1387 } 1388#ifdef REG_LITERAL 1389 } 1390#endif /* REG_LITERAL */ 1391 1392#ifdef REG_LEFT_ASSOC 1393 if (ctx->cflags & REG_LEFT_ASSOC) 1394 { 1395 /* Left associative concatenation. */ 1396 STACK_PUSHX(stack, int, PARSE_CATENATION); 1397 STACK_PUSHX(stack, voidptr, result); 1398 STACK_PUSHX(stack, int, PARSE_POST_CATENATION); 1399 STACK_PUSHX(stack, int, 0); // bre_branch_begin 1400 STACK_PUSHX(stack, int, PARSE_PIECE); 1401 } 1402 else 1403#endif /* REG_LEFT_ASSOC */ 1404 { 1405 /* Default case, right associative concatenation. */ 1406 STACK_PUSHX(stack, voidptr, result); 1407 STACK_PUSHX(stack, int, PARSE_POST_CATENATION); 1408 STACK_PUSHX(stack, int, PARSE_CATENATION); 1409 STACK_PUSHX(stack, int, 0); // bre_branch_begin 1410 STACK_PUSHX(stack, int, PARSE_PIECE); 1411 } 1412 break; 1413 } 1414 1415 case PARSE_POST_CATENATION: 1416 { 1417 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); 1418 tre_ast_node_t *tmp_node; 1419 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); 1420 if (!tmp_node) 1421 return REG_ESPACE; 1422 result = tmp_node; 1423 break; 1424 } 1425 1426 case PARSE_UNION: 1427 if (ctx->re >= ctx->re_end) 1428 break; 1429#ifdef REG_LITERAL 1430 if (ctx->cflags & REG_LITERAL) 1431 break; 1432#endif /* REG_LITERAL */ 1433 if (!(ctx->cflags & REG_EXTENDED)) 1434 { 1435 if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end) 1436 break; 1437 ctx->re++; 1438 } 1439 switch (*ctx->re) 1440 { 1441 case CHAR_PIPE: 1442 DPRINT(("tre_parse: union: '%.*" STRF "'\n", 1443 REST(ctx->re))); 1444 STACK_PUSHX(stack, int, PARSE_UNION); 1445 STACK_PUSHX(stack, voidptr, (void *)ctx->re); 1446 STACK_PUSHX(stack, voidptr, result); 1447 STACK_PUSHX(stack, int, PARSE_POST_UNION); 1448 /* We need to pass a boolean (eventually) to PARSE_ATOM to 1449 indicate if this is the beginning of a BRE extended branch. */ 1450 STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin 1451 STACK_PUSHX(stack, int, PARSE_BRANCH); 1452 ctx->re++; 1453 break; 1454 1455 case CHAR_RPAREN: 1456 ctx->re++; 1457 break; 1458 1459 default: 1460 if (!(ctx->cflags & REG_EXTENDED)) 1461 ctx->re--; 1462 break; 1463 } 1464 break; 1465 1466 case PARSE_POST_UNION: 1467 { 1468 tre_ast_node_t *tmp_node; 1469 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); 1470 const tre_char_t *pipechar = tre_stack_pop_voidptr(stack); 1471 /* error on empty expression at end of union */ 1472 if (pipechar == ctx->re - 1) 1473 { 1474 return REG_EMPTY; 1475 } 1476 tmp_node = tre_ast_new_union(ctx->mem, tree, result); 1477 if (!tmp_node) 1478 return REG_ESPACE; 1479 result = tmp_node; 1480 break; 1481 } 1482 1483 case PARSE_POSTFIX: 1484 /* Parse postfix operators. */ 1485 if (ctx->re >= ctx->re_end) 1486 break; 1487#ifdef REG_LITERAL 1488 if (ctx->cflags & REG_LITERAL) 1489 break; 1490#endif /* REG_LITERAL */ 1491 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; 1492 int rep_min = 0; 1493 int rep_max = -1; 1494#ifdef TRE_DEBUG 1495 int lbrace_off; 1496#endif 1497 switch (*ctx->re) 1498 { 1499 case CHAR_PLUS: 1500 case CHAR_QUESTIONMARK: 1501 if (!(ctx->cflags & REG_EXTENDED)) 1502 break; 1503 /*FALLTHROUGH*/ 1504 case CHAR_STAR: 1505 { 1506 tre_ast_node_t *tmp_node; 1507#ifdef TRE_DEBUG 1508 const char *tstr = "star"; 1509 tmp_re = ctx->re; 1510#endif 1511 1512 handle_plus_or_question: 1513 /* error on iteration of raw assertion (not in subexpression) */ 1514 if (result->type == LITERAL && result->submatch_id < 0 && 1515 IS_ASSERTION((tre_literal_t *)result->obj)) 1516 { 1517 if (!(ctx->cflags & REG_EXTENDED)) break; 1518 return REG_BADRPT; 1519 } 1520 if (*ctx->re == CHAR_PLUS) 1521 { 1522 rep_min = 1; 1523#ifdef TRE_DEBUG 1524 tstr = "plus"; 1525#endif 1526 } 1527 if (*ctx->re == CHAR_QUESTIONMARK) 1528 { 1529 rep_max = 1; 1530#ifdef TRE_DEBUG 1531 tstr = "questionmark"; 1532#endif 1533 } 1534 1535 if (ctx->cflags & REG_EXTENDED) 1536 { 1537 if (ctx->re + 1 < ctx->re_end) 1538 { 1539 if (*(ctx->re + 1) == CHAR_QUESTIONMARK) 1540 { 1541 /* Process the question mark only in enhanced mode. 1542 Otherwise, the question mark is an error in ERE */ 1543 if (ctx->cflags & REG_ENHANCED) 1544 { 1545 minimal = !(ctx->cflags & REG_UNGREEDY); 1546 ctx->re++; 1547 } 1548 else return REG_BADRPT; 1549 } 1550 else if (*(ctx->re + 1) == CHAR_STAR 1551 || *(ctx->re + 1) == CHAR_PLUS) 1552 { 1553 /* These are reserved for future extensions. */ 1554 return REG_BADRPT; 1555 } 1556 } 1557 } 1558 else 1559 { 1560 if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR) 1561 { 1562 /* This is reserved for future extensions. */ 1563 return REG_BADRPT; 1564 } 1565 if (ctx->re + 2 < ctx->re_end) 1566 { 1567 if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_QUESTIONMARK) 1568 { 1569 /* Process the question mark only in enhanced mode. 1570 Otherwise, the question mark is a literal in BRE */ 1571 if (ctx->cflags & REG_ENHANCED) 1572 { 1573 minimal = !(ctx->cflags & REG_UNGREEDY); 1574 ctx->re += 2; 1575 } 1576 } 1577 else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS) 1578 { 1579 /* This is reserved for future extensions. */ 1580 return REG_BADRPT; 1581 } 1582 } 1583 } 1584 1585 if (minimal) 1586 ctx->num_reorder_tags++; 1587 1588 DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n", 1589 minimal ? " minimal" : "greedy", tstr, REST(tmp_re))); 1590 if (result == NULL) 1591 { 1592 if (ctx->cflags & REG_EXTENDED) return REG_BADRPT; 1593 else goto parse_literal; 1594 } 1595 ctx->re++; 1596 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, 1597 minimal); 1598 if (tmp_node == NULL) 1599 return REG_ESPACE; 1600 result = tmp_node; 1601 1602 /* Set the iterator with a submatch id in the invisible range 1603 * (which will be overridden if a real submatch is needed) */ 1604 result->submatch_id = ctx->submatch_id_invisible++; 1605 1606#if 0 1607 /* We don't allow multiple postfixes, but this might be needed 1608 to support approximate matching */ 1609 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1610#endif 1611 } 1612 break; 1613 1614 case CHAR_BACKSLASH: 1615 /* "\{" is special without REG_EXTENDED */ 1616 /* "\+" and "\?" are special with REG_ENHANCED for BRE */ 1617 if (!(ctx->cflags & REG_EXTENDED) 1618 && ctx->re + 1 < ctx->re_end) 1619 { 1620 switch (*(ctx->re + 1)) 1621 { 1622 case CHAR_LBRACE: 1623 ctx->re++; 1624#ifdef TRE_DEBUG 1625 lbrace_off = 2; 1626#endif 1627 goto parse_brace; 1628 case CHAR_PLUS: 1629 case CHAR_QUESTIONMARK: 1630 if (ctx->cflags & REG_ENHANCED) 1631 { 1632#ifdef TRE_DEBUG 1633 tmp_re = ctx->re; 1634#endif 1635 ctx->re++; 1636 goto handle_plus_or_question; 1637 } 1638 break; 1639 } 1640 break; 1641 } 1642 else 1643 break; 1644 1645 case CHAR_LBRACE: 1646 { 1647 int raw_assertion; 1648 1649 /* "{" is literal without REG_EXTENDED */ 1650 if (!(ctx->cflags & REG_EXTENDED)) 1651 break; 1652#ifdef TRE_DEBUG 1653 lbrace_off = 1; 1654#endif 1655 1656 parse_brace: 1657 /* error on iteration of raw assertion (not in subexpression), 1658 but wait until after parsing bounds */ 1659 raw_assertion = (result->type == LITERAL 1660 && result->submatch_id < 0 1661 && IS_ASSERTION((tre_literal_t *)result->obj)); 1662 ctx->re++; 1663 1664 status = tre_parse_bound(ctx, &result); 1665#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 1666 /* For ERE, if status is REG_NOMATCH, this mean the lbrace 1667 is to be treated as a literal. */ 1668 if (status == REG_NOMATCH) 1669 { 1670 ctx->re--; 1671 break; 1672 } 1673#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 1674 DPRINT(("tre_parse: bound: '%.*" STRF "'\n", 1675 REST(ctx->re - lbrace_off))); 1676 if (status != REG_OK) 1677 return status; 1678 if (raw_assertion) return REG_BADRPT; 1679 1680 /* Set the iterator with a submatch id in the invisible range 1681 * (which will be overridden if a real submatch is needed) */ 1682 if (result->type == ITERATION) 1683 result->submatch_id = ctx->submatch_id_invisible++; 1684 1685#if 0 1686 /* We don't allow multiple postfixes, but this might be needed 1687 to support approximate matching */ 1688 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1689#endif 1690 break; 1691 } 1692 } 1693 break; 1694 1695 case PARSE_ATOM: 1696 { 1697 /* Parse an atom. An atom is a regular expression enclosed in `()', 1698 an empty set of `()', a bracket expression, `.', `^', `$', 1699 a `\' followed by a character, or a single character. */ 1700 1701 /* The stack contains a boolean value, whether PARSE_ATOM is 1702 being called just after the start of a group (left paren) 1703 in a BRE */ 1704 bre_branch_begin = tre_stack_pop_int(stack); 1705 1706 /* End of regexp? (empty string). */ 1707 if (ctx->re >= ctx->re_end) 1708 goto parse_literal; 1709 1710#ifdef REG_LITERAL 1711 if (ctx->cflags & REG_LITERAL) 1712 goto parse_literal; 1713#endif /* REG_LITERAL */ 1714 1715 switch (*ctx->re) 1716 { 1717 case CHAR_LPAREN: /* parenthesized subexpression */ 1718 1719 /* Handle "(?...)" extensions. They work in a way similar 1720 to Perls corresponding extensions. */ 1721 if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) == 1722 (REG_EXTENDED|REG_ENHANCED) 1723 && *(ctx->re + 1) == CHAR_QUESTIONMARK) 1724 { 1725 int new_cflags = ctx->cflags; 1726 int bit = 1; 1727 int invisible_submatch = 0; 1728 DPRINT(("tre_parse: extension: '%.*" STRF "'\n", 1729 REST(ctx->re))); 1730 ctx->re += 2; 1731 while (/*CONSTCOND*/1) 1732 { 1733 if (*ctx->re == L'i') 1734 { 1735 DPRINT(("tre_parse: icase: '%.*" STRF "'\n", 1736 REST(ctx->re))); 1737 if (bit) 1738 new_cflags |= REG_ICASE; 1739 else 1740 new_cflags &= ~REG_ICASE; 1741 ctx->re++; 1742 } 1743 else if (*ctx->re == L'n') 1744 { 1745 DPRINT(("tre_parse: newline: '%.*" STRF "'\n", 1746 REST(ctx->re))); 1747 if (bit) 1748 new_cflags |= REG_NEWLINE; 1749 else 1750 new_cflags &= ~REG_NEWLINE; 1751 ctx->re++; 1752 } 1753#ifdef REG_LEFT_ASSOC 1754 else if (*ctx->re == L'l') 1755 { 1756 DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n", 1757 REST(ctx->re))); 1758 if (bit) 1759 new_cflags |= REG_LEFT_ASSOC; 1760 else 1761 new_cflags &= ~REG_LEFT_ASSOC; 1762 ctx->re++; 1763 } 1764#endif /* REG_LEFT_ASSOC */ 1765#ifdef REG_UNGREEDY 1766 else if (*ctx->re == L'U') 1767 { 1768 DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n", 1769 REST(ctx->re))); 1770 if (bit) 1771 new_cflags |= REG_UNGREEDY; 1772 else 1773 new_cflags &= ~REG_UNGREEDY; 1774 ctx->re++; 1775 } 1776#endif /* REG_UNGREEDY */ 1777 else if (*ctx->re == CHAR_MINUS) 1778 { 1779 DPRINT(("tre_parse: turn off: '%.*" STRF "'\n", 1780 REST(ctx->re))); 1781 ctx->re++; 1782 bit = 0; 1783 } 1784 else if (*ctx->re == CHAR_COLON) 1785 { 1786 DPRINT(("tre_parse: no group: '%.*" STRF 1787 "', (invisible submatch %d)\n", 1788 REST(ctx->re), ctx->submatch_id_invisible)); 1789 ctx->re++; 1790 depth++; 1791 invisible_submatch = 1; 1792 break; 1793 } 1794 else if (*ctx->re == CHAR_HASH) 1795 { 1796 DPRINT(("tre_parse: comment: '%.*" STRF "'\n", 1797 REST(ctx->re))); 1798 /* A comment can contain any character except a 1799 right parenthesis */ 1800 while (*ctx->re != CHAR_RPAREN 1801 && ctx->re < ctx->re_end) 1802 ctx->re++; 1803 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end) 1804 { 1805 ctx->re++; 1806 break; 1807 } 1808 else 1809 return REG_BADPAT; 1810 } 1811 else if (*ctx->re == CHAR_RPAREN) 1812 { 1813 ctx->re++; 1814 break; 1815 } 1816 else 1817 return REG_BADRPT; 1818 } 1819 1820 /* Turn on the cflags changes for the rest of the 1821 enclosing group. */ 1822 if (invisible_submatch) 1823 { 1824 STACK_PUSHX(stack, int, ctx->cflags); 1825 STACK_PUSHX(stack, int, ctx->submatch_id_invisible); 1826 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); 1827 ctx->submatch_id_invisible++; 1828 STACK_PUSHX(stack, int, 0); // bre_branch_begin 1829 STACK_PUSHX(stack, int, PARSE_RE); 1830 } 1831 else { 1832 STACK_PUSHX(stack, int, 0); // bre_branch_begin 1833 STACK_PUSHX(stack, int, PARSE_ATOM); 1834 } 1835 ctx->cflags = new_cflags; 1836 break; 1837 } 1838 1839 if (ctx->cflags & REG_EXTENDED) 1840 { 1841 parse_bre_lparen: 1842 DPRINT(("tre_parse: group begin: '%.*" STRF 1843 "', submatch %d\n", REST(ctx->re), 1844 ctx->submatch_id)); 1845 ctx->re++; 1846 /* First parse a whole RE, then mark the resulting tree 1847 for submatching. */ 1848 STACK_PUSHX(stack, int, ctx->cflags); 1849 STACK_PUSHX(stack, int, ctx->submatch_id); 1850 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); 1851 /* We need to pass a boolean (eventually) to PARSE_ATOM to 1852 indicate if this is the beginning of a BRE group. */ 1853 STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED)); 1854 STACK_PUSHX(stack, int, PARSE_RE); 1855 ctx->submatch_id++; 1856 depth++; 1857 } 1858 else 1859 goto parse_literal; 1860 break; 1861 1862 case CHAR_RPAREN: /* end of current subexpression */ 1863 if (ctx->cflags & REG_EXTENDED && depth > 0) 1864 { 1865 parse_bre_rparen_empty: 1866 if (!(ctx->cflags & REG_EXTENDED) && depth == 0) 1867 return REG_EPAREN; 1868 DPRINT(("tre_parse: empty: '%.*" STRF "'\n", 1869 REST(ctx->re))); 1870 /* We were expecting an atom, but instead the current 1871 subexpression was closed. POSIX leaves the meaning of 1872 this to be implementation-defined. We interpret this as 1873 an empty expression (which matches an empty string). */ 1874 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1875 if (result == NULL) 1876 return REG_ESPACE; 1877 if (!(ctx->cflags & REG_EXTENDED)) 1878 ctx->re--; 1879 } 1880 else 1881 goto parse_literal; 1882 break; 1883 1884 case CHAR_LBRACKET: /* bracket expression */ 1885 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", 1886 REST(ctx->re))); 1887 ctx->re++; 1888 status = tre_parse_bracket(ctx, &result); 1889 if (status != REG_OK) 1890 return status; 1891 break; 1892 1893 case CHAR_BACKSLASH: 1894 /* Deal with "\(", "\)" or "\{" for BREs */ 1895 if (!(ctx->cflags & REG_EXTENDED) 1896 && ctx->re + 1 < ctx->re_end) 1897 { 1898 if (*(ctx->re + 1) == CHAR_LPAREN) 1899 { 1900 ctx->re++; 1901 goto parse_bre_lparen; 1902 } 1903 else if (*(ctx->re + 1) == CHAR_RPAREN) 1904 { 1905 ctx->re++; 1906 goto parse_bre_rparen_empty; 1907 } 1908 if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal; 1909 } 1910 1911 if (ctx->re + 1 >= ctx->re_end) 1912 /* Trailing backslash. */ 1913 return REG_EESCAPE; 1914 1915 if (!(ctx->cflags & REG_ENHANCED)) 1916 { 1917 DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re))); 1918 ctx->re++; 1919 goto unenhanced_backslash; 1920 } 1921 1922 /* If a macro is used, parse the expanded macro recursively. */ 1923 { 1924 tre_char_t buf[64]; 1925 tre_expand_macro(ctx->re + 1, ctx->re_end, 1926 buf, elementsof(buf)); 1927 if (buf[0] != 0) 1928 { 1929 tre_parse_ctx_t subctx; 1930 memcpy(&subctx, ctx, sizeof(subctx)); 1931 subctx.re = buf; 1932 subctx.len = tre_strlen(buf); 1933 subctx.nofirstsub = 1; 1934 status = tre_parse(&subctx); 1935 if (status != REG_OK) 1936 return status; 1937 ctx->re += 2; 1938 ctx->position = subctx.position; 1939 result = subctx.result; 1940 break; 1941 } 1942 } 1943 1944#ifdef REG_LITERAL 1945 if (*(ctx->re + 1) == L'Q') 1946 { 1947 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n", 1948 REST(ctx->re))); 1949 ctx->cflags |= REG_LITERAL; 1950 temporary_cflags |= REG_LITERAL; 1951 ctx->re += 2; 1952 STACK_PUSHX(stack, int, 0); 1953 STACK_PUSHX(stack, int, PARSE_ATOM); 1954 break; 1955 } 1956#endif /* REG_LITERAL */ 1957 1958 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re))); 1959 ctx->re++; 1960 switch (*ctx->re) 1961 { 1962 case L'b': 1963 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1964 ASSERT_AT_WB, -1); 1965 ctx->re++; 1966 break; 1967 case L'B': 1968 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1969 ASSERT_AT_WB_NEG, -1); 1970 ctx->re++; 1971 break; 1972 case L'<': 1973 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1974 ASSERT_AT_BOW, -1); 1975 ctx->re++; 1976 break; 1977 case L'>': 1978 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1979 ASSERT_AT_EOW, -1); 1980 ctx->re++; 1981 break; 1982 case L'x': 1983 ctx->re++; 1984 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end) 1985 { 1986 /* 8 bit hex char. */ 1987 char tmp[3] = {0, 0, 0}; 1988 long val; 1989 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n", 1990 REST(ctx->re - 2))); 1991 1992 if (tre_isxdigit_l(ctx->re[0], ctx->loc) && 1993 ctx->re < ctx->re_end) 1994 { 1995 tmp[0] = (char)ctx->re[0]; 1996 ctx->re++; 1997 } 1998 if (tre_isxdigit_l(ctx->re[0], ctx->loc) && 1999 ctx->re < ctx->re_end) 2000 { 2001 tmp[1] = (char)ctx->re[0]; 2002 ctx->re++; 2003 } 2004 val = strtol(tmp, NULL, 16); 2005 result = tre_ast_new_literal(ctx->mem, (int)val, 2006 (int)val, ctx->position); 2007 ctx->position++; 2008 break; 2009 } 2010 else if (ctx->re < ctx->re_end) 2011 { 2012 /* Wide char. */ 2013 char tmp[32]; 2014 long val; 2015 int i = 0; 2016 ctx->re++; 2017 while (ctx->re_end - ctx->re >= 0) 2018 { 2019 if (ctx->re[0] == CHAR_RBRACE) 2020 break; 2021 if (tre_isxdigit_l(ctx->re[0], ctx->loc)) 2022 { 2023 tmp[i] = (char)ctx->re[0]; 2024 i++; 2025 ctx->re++; 2026 continue; 2027 } 2028 return REG_EBRACE; 2029 } 2030 ctx->re++; 2031 tmp[i] = 0; 2032 val = strtol(tmp, NULL, 16); 2033 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, 2034 ctx->position); 2035 ctx->position++; 2036 break; 2037 } 2038 /*FALLTHROUGH*/ 2039 2040 default: 2041 unenhanced_backslash: 2042 if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) != 2043 REG_EXTENDED && 2044 tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0') 2045 { 2046 /* Back reference (only in BRE or enhanced). */ 2047 int val = *ctx->re - L'0'; 2048 DPRINT(("tre_parse: backref: '%.*" STRF "'\n", 2049 REST(ctx->re - 1))); 2050 result = tre_ast_new_literal(ctx->mem, BACKREF, val, 2051 ctx->position); 2052 if (result == NULL) 2053 return REG_ESPACE; 2054 2055 /* Set the backref with a submatch id in the invisible 2056 * range (which will be overridden if a real submatch 2057 * is needed) */ 2058 result->submatch_id = ctx->submatch_id_invisible++; 2059 2060 ctx->position++; 2061 ctx->num_reorder_tags++; 2062 ctx->max_backref = MAX(val, ctx->max_backref); 2063 ctx->re++; 2064 } 2065 else 2066 { 2067 /* Escaped character. */ 2068 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", 2069 REST(ctx->re - 1))); 2070 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, 2071 ctx->position); 2072 ctx->position++; 2073 ctx->re++; 2074 } 2075 break; 2076 } 2077 if (result == NULL) 2078 return REG_ESPACE; 2079 break; 2080 2081 case CHAR_PERIOD: /* the any-symbol */ 2082 DPRINT(("tre_parse: any: '%.*" STRF "'\n", 2083 REST(ctx->re))); 2084 if (ctx->cflags & REG_NEWLINE) 2085 { 2086 tre_ast_node_t *tmp1; 2087 tre_ast_node_t *tmp2; 2088 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1, 2089 ctx->position); 2090 if (!tmp1) 2091 return REG_ESPACE; 2092 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX, 2093 ctx->position + 1); 2094 if (!tmp2) 2095 return REG_ESPACE; 2096 result = tre_ast_new_union(ctx->mem, tmp1, tmp2); 2097 if (!result) 2098 return REG_ESPACE; 2099 ctx->position += 2; 2100 } 2101 else 2102 { 2103 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, 2104 ctx->position); 2105 if (!result) 2106 return REG_ESPACE; 2107 ctx->position++; 2108 } 2109 ctx->re++; 2110 break; 2111 2112 case CHAR_CARET: /* beginning of line assertion */ 2113 /* '^' has a special meaning everywhere in EREs, at the 2114 beginning of the RE and after \( is BREs. It is also 2115 special in enhanced BREs at the beginning of each branches 2116 of a union */ 2117 if (ctx->cflags & REG_EXTENDED 2118 || bre_branch_begin 2119 || ctx->re == ctx->re_start) 2120 { 2121 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", 2122 REST(ctx->re))); 2123 result = tre_ast_new_literal(ctx->mem, ASSERTION, 2124 ASSERT_AT_BOL, -1); 2125 if (result == NULL) 2126 return REG_ESPACE; 2127 ctx->re++; 2128 } 2129 else 2130 goto parse_literal; 2131 break; 2132 2133 case CHAR_DOLLAR: /* end of line assertion. */ 2134 /* '$' is special everywhere in EREs, and in the end of the 2135 string and before \) is BREs. */ 2136 if (ctx->cflags & REG_EXTENDED 2137 || (ctx->re + 2 < ctx->re_end 2138 && *(ctx->re + 1) == CHAR_BACKSLASH 2139 && *(ctx->re + 2) == CHAR_RPAREN) 2140 || ctx->re + 1 == ctx->re_end) 2141 { 2142 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", 2143 REST(ctx->re))); 2144 result = tre_ast_new_literal(ctx->mem, ASSERTION, 2145 ASSERT_AT_EOL, -1); 2146 if (result == NULL) 2147 return REG_ESPACE; 2148 ctx->re++; 2149 } 2150 else 2151 goto parse_literal; 2152 break; 2153 2154 default: 2155 parse_literal: 2156 2157 if (temporary_cflags && ctx->re + 1 < ctx->re_end 2158 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E') 2159 { 2160 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n", 2161 REST(ctx->re))); 2162 ctx->cflags &= ~temporary_cflags; 2163 temporary_cflags = 0; 2164 ctx->re += 2; 2165 if (ctx->re < ctx->re_end) 2166 { 2167 STACK_PUSHX(stack, int, 0); 2168 STACK_PUSHX(stack, int, PARSE_ATOM); 2169 } 2170 else 2171 { 2172 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 2173 if (!result) return REG_ESPACE; 2174 } 2175 break; 2176 } 2177 2178 2179 /* We are expecting an atom. If the subexpression (or the whole 2180 regexp ends here, we interpret it as an empty expression 2181 (which matches an empty string), which is an error. 2182 Iterations of an empty expression is also an error. */ 2183#ifdef REG_LITERAL 2184 if (!(ctx->cflags & REG_LITERAL)) 2185 { 2186#endif /* REG_LITERAL */ 2187 /* error on end of string */ 2188 if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN 2189 : REG_EMPTY; 2190 /* error on unions and iterations of empty expressions */ 2191 if (ctx->cflags & REG_EXTENDED) 2192 { 2193 if (ctx->re < ctx->re_end) 2194 { 2195 if (*ctx->re == CHAR_PIPE) return REG_EMPTY; 2196 if (*ctx->re == CHAR_LBRACE) 2197 { 2198 ctx->re++; 2199 empty_parse_bound: 2200 /* We need to parse the bound first and return 2201 any error, before returning REG_BADRPT */ 2202 status = tre_parse_bound(ctx, NULL); 2203#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 2204 /* For ERE, if REG_NOMATCH is returned, we 2205 treat the lbrace as a literal. */ 2206 if (status == REG_NOMATCH) 2207 { 2208 ctx->re--; 2209 /* Drop down to literal-handling code */ 2210 } 2211 else 2212 { 2213#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 2214 if (status != REG_OK) 2215 return status; 2216 return REG_BADRPT; 2217#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 2218 } 2219#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 2220 } 2221#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 2222 else 2223#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 2224 if (*ctx->re == CHAR_STAR 2225 || *ctx->re == CHAR_PLUS 2226 || *ctx->re == CHAR_QUESTIONMARK) 2227 { 2228 return REG_BADRPT; 2229 } 2230 } 2231 } 2232 else if (ctx->re + 1 < ctx->re_end 2233 && *ctx->re == CHAR_BACKSLASH 2234 && *(ctx->re + 1) == CHAR_LBRACE) 2235 { 2236 ctx->re += 2; 2237 goto empty_parse_bound; 2238 } 2239#ifdef REG_LITERAL 2240 } 2241#endif /* REG_LITERAL */ 2242 2243 DPRINT(("tre_parse: literal: '%.*" STRF "'\n", 2244 REST(ctx->re))); 2245 /* Note that we can't use an tre_isalpha() test here, since there 2246 may be characters which are alphabetic but neither upper or 2247 lower case. */ 2248 if (ctx->cflags & REG_ICASE 2249 && (tre_isupper_l(*ctx->re, ctx->loc) || 2250 tre_islower_l(*ctx->re, ctx->loc))) 2251 { 2252 tre_ast_node_t *tmp1; 2253 tre_ast_node_t *tmp2; 2254 2255 /* XXX - Can there be more than one opposite-case 2256 counterpoints for some character in some locale? Or 2257 more than two characters which all should be regarded 2258 the same character if case is ignored? If yes, there 2259 does not seem to be a portable way to detect it. I guess 2260 that at least for multi-character collating elements there 2261 could be several opposite-case counterpoints, but they 2262 cannot be supported portably anyway. */ 2263 tmp1 = tre_ast_new_literal(ctx->mem, 2264 tre_toupper_l(*ctx->re, ctx->loc), 2265 tre_toupper_l(*ctx->re, ctx->loc), 2266 ctx->position); 2267 if (!tmp1) 2268 return REG_ESPACE; 2269 tmp2 = tre_ast_new_literal(ctx->mem, 2270 tre_tolower_l(*ctx->re, ctx->loc), 2271 tre_tolower_l(*ctx->re, ctx->loc), 2272 ctx->position); 2273 if (!tmp2) 2274 return REG_ESPACE; 2275 result = tre_ast_new_union(ctx->mem, tmp1, tmp2); 2276 if (!result) 2277 return REG_ESPACE; 2278 } 2279 else 2280 { 2281 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, 2282 ctx->position); 2283 if (!result) 2284 return REG_ESPACE; 2285 } 2286 ctx->position++; 2287 ctx->re++; 2288 break; 2289 } 2290 break; 2291 } 2292 2293 case PARSE_MARK_FOR_SUBMATCH: 2294 { 2295 int submatch_id = tre_stack_pop_int(stack); 2296 2297 ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */ 2298 if (result->submatch_id >= 0 && 2299 result->submatch_id < SUBMATCH_ID_INVISIBLE_START) 2300 { 2301 tre_ast_node_t *n, *tmp_node; 2302 if (submatch_id >= SUBMATCH_ID_INVISIBLE_START) 2303 break; 2304 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 2305 if (n == NULL) 2306 return REG_ESPACE; 2307 tmp_node = tre_ast_new_catenation(ctx->mem, n, result); 2308 if (tmp_node == NULL) 2309 return REG_ESPACE; 2310 tmp_node->num_submatches = result->num_submatches; 2311 result = tmp_node; 2312 } 2313 result->submatch_id = submatch_id; 2314 if (submatch_id < SUBMATCH_ID_INVISIBLE_START) 2315 result->num_submatches++; 2316 break; 2317 } 2318 2319 default: 2320 assert(0); 2321 break; 2322 } 2323 } 2324 2325 /* Check for missing closing parentheses. */ 2326 if (depth > 0) 2327 return REG_EPAREN; 2328 2329 ctx->result = result; 2330 2331 return REG_OK; 2332} 2333 2334/* EOF */ 2335