1/* 2 regcomp.c - TRE POSIX compatible regex compilation functions. 3 4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> 5 All rights reserved. 6 7 Redistribution and use in source and binary forms, with or without 8 modification, are permitted provided that the following conditions 9 are met: 10 11 1. Redistributions of source code must retain the above copyright 12 notice, this list of conditions and the following disclaimer. 13 14 2. Redistributions in binary form must reproduce the above copyright 15 notice, this list of conditions and the following disclaimer in the 16 documentation and/or other materials provided with the distribution. 17 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS 19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 30*/ 31 32#include <string.h> 33#include <stdlib.h> 34#include <regex.h> 35#include <limits.h> 36#include <stdint.h> 37#include <ctype.h> 38 39#include "tre.h" 40 41#include <assert.h> 42 43/*********************************************************************** 44 from tre-compile.h 45***********************************************************************/ 46 47typedef struct { 48 int position; 49 int code_min; 50 int code_max; 51 int *tags; 52 int assertions; 53 tre_ctype_t class; 54 tre_ctype_t *neg_classes; 55 int backref; 56} tre_pos_and_tags_t; 57 58 59/*********************************************************************** 60 from tre-ast.c and tre-ast.h 61***********************************************************************/ 62 63/* The different AST node types. */ 64typedef enum { 65 LITERAL, 66 CATENATION, 67 ITERATION, 68 UNION 69} tre_ast_type_t; 70 71/* Special subtypes of TRE_LITERAL. */ 72#define EMPTY -1 /* Empty leaf (denotes empty string). */ 73#define ASSERTION -2 /* Assertion leaf. */ 74#define TAG -3 /* Tag leaf. */ 75#define BACKREF -4 /* Back reference leaf. */ 76 77#define IS_SPECIAL(x) ((x)->code_min < 0) 78#define IS_EMPTY(x) ((x)->code_min == EMPTY) 79#define IS_ASSERTION(x) ((x)->code_min == ASSERTION) 80#define IS_TAG(x) ((x)->code_min == TAG) 81#define IS_BACKREF(x) ((x)->code_min == BACKREF) 82 83 84/* A generic AST node. All AST nodes consist of this node on the top 85 level with `obj' pointing to the actual content. */ 86typedef struct { 87 tre_ast_type_t type; /* Type of the node. */ 88 void *obj; /* Pointer to actual node. */ 89 int nullable; 90 int submatch_id; 91 int num_submatches; 92 int num_tags; 93 tre_pos_and_tags_t *firstpos; 94 tre_pos_and_tags_t *lastpos; 95} tre_ast_node_t; 96 97 98/* A "literal" node. These are created for assertions, back references, 99 tags, matching parameter settings, and all expressions that match one 100 character. */ 101typedef struct { 102 long code_min; 103 long code_max; 104 int position; 105 tre_ctype_t class; 106 tre_ctype_t *neg_classes; 107} tre_literal_t; 108 109/* A "catenation" node. These are created when two regexps are concatenated. 110 If there are more than one subexpressions in sequence, the `left' part 111 holds all but the last, and `right' part holds the last subexpression 112 (catenation is left associative). */ 113typedef struct { 114 tre_ast_node_t *left; 115 tre_ast_node_t *right; 116} tre_catenation_t; 117 118/* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}" 119 operators. */ 120typedef struct { 121 /* Subexpression to match. */ 122 tre_ast_node_t *arg; 123 /* Minimum number of consecutive matches. */ 124 int min; 125 /* Maximum number of consecutive matches. */ 126 int max; 127 /* If 0, match as many characters as possible, if 1 match as few as 128 possible. Note that this does not always mean the same thing as 129 matching as many/few repetitions as possible. */ 130 unsigned int minimal:1; 131} tre_iteration_t; 132 133/* An "union" node. These are created for the "|" operator. */ 134typedef struct { 135 tre_ast_node_t *left; 136 tre_ast_node_t *right; 137} tre_union_t; 138 139 140static tre_ast_node_t * 141tre_ast_new_node(tre_mem_t mem, int type, void *obj) 142{ 143 tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node); 144 if (!node || !obj) 145 return 0; 146 node->obj = obj; 147 node->type = type; 148 node->nullable = -1; 149 node->submatch_id = -1; 150 return node; 151} 152 153static tre_ast_node_t * 154tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position) 155{ 156 tre_ast_node_t *node; 157 tre_literal_t *lit; 158 159 lit = tre_mem_calloc(mem, sizeof *lit); 160 node = tre_ast_new_node(mem, LITERAL, lit); 161 if (!node) 162 return 0; 163 lit->code_min = code_min; 164 lit->code_max = code_max; 165 lit->position = position; 166 return node; 167} 168 169static tre_ast_node_t * 170tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal) 171{ 172 tre_ast_node_t *node; 173 tre_iteration_t *iter; 174 175 iter = tre_mem_calloc(mem, sizeof *iter); 176 node = tre_ast_new_node(mem, ITERATION, iter); 177 if (!node) 178 return 0; 179 iter->arg = arg; 180 iter->min = min; 181 iter->max = max; 182 iter->minimal = minimal; 183 node->num_submatches = arg->num_submatches; 184 return node; 185} 186 187static tre_ast_node_t * 188tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right) 189{ 190 tre_ast_node_t *node; 191 tre_union_t *un; 192 193 if (!left) 194 return right; 195 un = tre_mem_calloc(mem, sizeof *un); 196 node = tre_ast_new_node(mem, UNION, un); 197 if (!node || !right) 198 return 0; 199 un->left = left; 200 un->right = right; 201 node->num_submatches = left->num_submatches + right->num_submatches; 202 return node; 203} 204 205static tre_ast_node_t * 206tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right) 207{ 208 tre_ast_node_t *node; 209 tre_catenation_t *cat; 210 211 if (!left) 212 return right; 213 cat = tre_mem_calloc(mem, sizeof *cat); 214 node = tre_ast_new_node(mem, CATENATION, cat); 215 if (!node) 216 return 0; 217 cat->left = left; 218 cat->right = right; 219 node->num_submatches = left->num_submatches + right->num_submatches; 220 return node; 221} 222 223 224/*********************************************************************** 225 from tre-stack.c and tre-stack.h 226***********************************************************************/ 227 228typedef struct tre_stack_rec tre_stack_t; 229 230/* Creates a new stack object. `size' is initial size in bytes, `max_size' 231 is maximum size, and `increment' specifies how much more space will be 232 allocated with realloc() if all space gets used up. Returns the stack 233 object or NULL if out of memory. */ 234static tre_stack_t * 235tre_stack_new(int size, int max_size, int increment); 236 237/* Frees the stack object. */ 238static void 239tre_stack_destroy(tre_stack_t *s); 240 241/* Returns the current number of objects in the stack. */ 242static int 243tre_stack_num_objects(tre_stack_t *s); 244 245/* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes 246 `value' on top of stack `s'. Returns REG_ESPACE if out of memory. 247 This tries to realloc() more space before failing if maximum size 248 has not yet been reached. Returns REG_OK if successful. */ 249#define declare_pushf(typetag, type) \ 250 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value) 251 252declare_pushf(voidptr, void *); 253declare_pushf(int, int); 254 255/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost 256 element off of stack `s' and returns it. The stack must not be 257 empty. */ 258#define declare_popf(typetag, type) \ 259 static type tre_stack_pop_ ## typetag(tre_stack_t *s) 260 261declare_popf(voidptr, void *); 262declare_popf(int, int); 263 264/* Just to save some typing. */ 265#define STACK_PUSH(s, typetag, value) \ 266 do \ 267 { \ 268 status = tre_stack_push_ ## typetag(s, value); \ 269 } \ 270 while (/*CONSTCOND*/0) 271 272#define STACK_PUSHX(s, typetag, value) \ 273 { \ 274 status = tre_stack_push_ ## typetag(s, value); \ 275 if (status != REG_OK) \ 276 break; \ 277 } 278 279#define STACK_PUSHR(s, typetag, value) \ 280 { \ 281 reg_errcode_t _status; \ 282 _status = tre_stack_push_ ## typetag(s, value); \ 283 if (_status != REG_OK) \ 284 return _status; \ 285 } 286 287union tre_stack_item { 288 void *voidptr_value; 289 int int_value; 290}; 291 292struct tre_stack_rec { 293 int size; 294 int max_size; 295 int increment; 296 int ptr; 297 union tre_stack_item *stack; 298}; 299 300 301static tre_stack_t * 302tre_stack_new(int size, int max_size, int increment) 303{ 304 tre_stack_t *s; 305 306 s = xmalloc(sizeof(*s)); 307 if (s != NULL) 308 { 309 s->stack = xmalloc(sizeof(*s->stack) * size); 310 if (s->stack == NULL) 311 { 312 xfree(s); 313 return NULL; 314 } 315 s->size = size; 316 s->max_size = max_size; 317 s->increment = increment; 318 s->ptr = 0; 319 } 320 return s; 321} 322 323static void 324tre_stack_destroy(tre_stack_t *s) 325{ 326 xfree(s->stack); 327 xfree(s); 328} 329 330static int 331tre_stack_num_objects(tre_stack_t *s) 332{ 333 return s->ptr; 334} 335 336static reg_errcode_t 337tre_stack_push(tre_stack_t *s, union tre_stack_item value) 338{ 339 if (s->ptr < s->size) 340 { 341 s->stack[s->ptr] = value; 342 s->ptr++; 343 } 344 else 345 { 346 if (s->size >= s->max_size) 347 { 348 return REG_ESPACE; 349 } 350 else 351 { 352 union tre_stack_item *new_buffer; 353 int new_size; 354 new_size = s->size + s->increment; 355 if (new_size > s->max_size) 356 new_size = s->max_size; 357 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size); 358 if (new_buffer == NULL) 359 { 360 return REG_ESPACE; 361 } 362 assert(new_size > s->size); 363 s->size = new_size; 364 s->stack = new_buffer; 365 tre_stack_push(s, value); 366 } 367 } 368 return REG_OK; 369} 370 371#define define_pushf(typetag, type) \ 372 declare_pushf(typetag, type) { \ 373 union tre_stack_item item; \ 374 item.typetag ## _value = value; \ 375 return tre_stack_push(s, item); \ 376} 377 378define_pushf(int, int) 379define_pushf(voidptr, void *) 380 381#define define_popf(typetag, type) \ 382 declare_popf(typetag, type) { \ 383 return s->stack[--s->ptr].typetag ## _value; \ 384 } 385 386define_popf(int, int) 387define_popf(voidptr, void *) 388 389 390/*********************************************************************** 391 from tre-parse.c and tre-parse.h 392***********************************************************************/ 393 394/* Parse context. */ 395typedef struct { 396 /* Memory allocator. The AST is allocated using this. */ 397 tre_mem_t mem; 398 /* Stack used for keeping track of regexp syntax. */ 399 tre_stack_t *stack; 400 /* The parsed node after a parse function returns. */ 401 tre_ast_node_t *n; 402 /* Position in the regexp pattern after a parse function returns. */ 403 const char *s; 404 /* The first character of the last subexpression parsed. */ 405 const char *start; 406 /* Current submatch ID. */ 407 int submatch_id; 408 /* Current position (number of literal). */ 409 int position; 410 /* The highest back reference or -1 if none seen so far. */ 411 int max_backref; 412 /* Compilation flags. */ 413 int cflags; 414} tre_parse_ctx_t; 415 416/* Some macros for expanding \w, \s, etc. */ 417static const struct { 418 char c; 419 const char *expansion; 420} tre_macros[] = { 421 {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, 422 {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, 423 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, 424 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, 425 { 0, 0 } 426}; 427 428/* Expands a macro delimited by `regex' and `regex_end' to `buf', which 429 must have at least `len' items. Sets buf[0] to zero if the there 430 is no match in `tre_macros'. */ 431static const char *tre_expand_macro(const char *s) 432{ 433 int i; 434 for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++); 435 return tre_macros[i].expansion; 436} 437 438static int 439tre_compare_lit(const void *a, const void *b) 440{ 441 const tre_literal_t *const *la = a; 442 const tre_literal_t *const *lb = b; 443 /* assumes the range of valid code_min is < INT_MAX */ 444 return la[0]->code_min - lb[0]->code_min; 445} 446 447struct literals { 448 tre_mem_t mem; 449 tre_literal_t **a; 450 int len; 451 int cap; 452}; 453 454static tre_literal_t *tre_new_lit(struct literals *p) 455{ 456 tre_literal_t **a; 457 if (p->len >= p->cap) { 458 if (p->cap >= 1<<15) 459 return 0; 460 p->cap *= 2; 461 a = xrealloc(p->a, p->cap * sizeof *p->a); 462 if (!a) 463 return 0; 464 p->a = a; 465 } 466 a = p->a + p->len++; 467 *a = tre_mem_calloc(p->mem, sizeof **a); 468 return *a; 469} 470 471static int add_icase_literals(struct literals *ls, int min, int max) 472{ 473 tre_literal_t *lit; 474 int b, e, c; 475 for (c=min; c<=max; ) { 476 /* assumes islower(c) and isupper(c) are exclusive 477 and toupper(c)!=c if islower(c). 478 multiple opposite case characters are not supported */ 479 if (tre_islower(c)) { 480 b = e = tre_toupper(c); 481 for (c++, e++; c<=max; c++, e++) 482 if (tre_toupper(c) != e) break; 483 } else if (tre_isupper(c)) { 484 b = e = tre_tolower(c); 485 for (c++, e++; c<=max; c++, e++) 486 if (tre_tolower(c) != e) break; 487 } else { 488 c++; 489 continue; 490 } 491 lit = tre_new_lit(ls); 492 if (!lit) 493 return -1; 494 lit->code_min = b; 495 lit->code_max = e-1; 496 lit->position = -1; 497 } 498 return 0; 499} 500 501 502/* Maximum number of character classes in a negated bracket expression. */ 503#define MAX_NEG_CLASSES 64 504 505struct neg { 506 int negate; 507 int len; 508 tre_ctype_t a[MAX_NEG_CLASSES]; 509}; 510 511// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges 512 513/* 514bracket grammar: 515Bracket = '[' List ']' | '[^' List ']' 516List = Term | List Term 517Term = Char | Range | Chclass | Eqclass 518Range = Char '-' Char | Char '-' '-' 519Char = Coll | coll_single 520Meta = ']' | '-' 521Coll = '[.' coll_single '.]' | '[.' coll_multi '.]' | '[.' Meta '.]' 522Eqclass = '[=' coll_single '=]' | '[=' coll_multi '=]' 523Chclass = '[:' class ':]' 524 525coll_single is a single char collating element but it can be 526 '-' only at the beginning or end of a List and 527 ']' only at the beginning of a List and 528 '^' anywhere except after the openning '[' 529*/ 530 531static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg) 532{ 533 const char *start = s; 534 tre_ctype_t class; 535 int min, max; 536 wchar_t wc; 537 int len; 538 539 for (;;) { 540 class = 0; 541 len = mbtowc(&wc, s, -1); 542 if (len <= 0) 543 return *s ? REG_BADPAT : REG_EBRACK; 544 if (*s == ']' && s != start) { 545 ctx->s = s+1; 546 return REG_OK; 547 } 548 if (*s == '-' && s != start && s[1] != ']' && 549 /* extension: [a-z--@] is accepted as [a-z]|[--@] */ 550 (s[1] != '-' || s[2] == ']')) 551 return REG_ERANGE; 552 if (*s == '[' && (s[1] == '.' || s[1] == '=')) 553 /* collating symbols and equivalence classes are not supported */ 554 return REG_ECOLLATE; 555 if (*s == '[' && s[1] == ':') { 556 char tmp[CHARCLASS_NAME_MAX+1]; 557 s += 2; 558 for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) { 559 if (s[len] == ':') { 560 memcpy(tmp, s, len); 561 tmp[len] = 0; 562 class = tre_ctype(tmp); 563 break; 564 } 565 } 566 if (!class || s[len+1] != ']') 567 return REG_ECTYPE; 568 min = 0; 569 max = TRE_CHAR_MAX; 570 s += len+2; 571 } else { 572 min = max = wc; 573 s += len; 574 if (*s == '-' && s[1] != ']') { 575 s++; 576 len = mbtowc(&wc, s, -1); 577 max = wc; 578 /* XXX - Should use collation order instead of 579 encoding values in character ranges. */ 580 if (len <= 0 || min > max) 581 return REG_ERANGE; 582 s += len; 583 } 584 } 585 586 if (class && neg->negate) { 587 if (neg->len >= MAX_NEG_CLASSES) 588 return REG_ESPACE; 589 neg->a[neg->len++] = class; 590 } else { 591 tre_literal_t *lit = tre_new_lit(ls); 592 if (!lit) 593 return REG_ESPACE; 594 lit->code_min = min; 595 lit->code_max = max; 596 lit->class = class; 597 lit->position = -1; 598 599 /* Add opposite-case codepoints if REG_ICASE is present. 600 It seems that POSIX requires that bracket negation 601 should happen before case-folding, but most practical 602 implementations do it the other way around. Changing 603 the order would need efficient representation of 604 case-fold ranges and bracket range sets even with 605 simple patterns so this is ok for now. */ 606 if (ctx->cflags & REG_ICASE && !class) 607 if (add_icase_literals(ls, min, max)) 608 return REG_ESPACE; 609 } 610 } 611} 612 613static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s) 614{ 615 int i, max, min, negmax, negmin; 616 tre_ast_node_t *node = 0, *n; 617 tre_ctype_t *nc = 0; 618 tre_literal_t *lit; 619 struct literals ls; 620 struct neg neg; 621 reg_errcode_t err; 622 623 ls.mem = ctx->mem; 624 ls.len = 0; 625 ls.cap = 32; 626 ls.a = xmalloc(ls.cap * sizeof *ls.a); 627 if (!ls.a) 628 return REG_ESPACE; 629 neg.len = 0; 630 neg.negate = *s == '^'; 631 if (neg.negate) 632 s++; 633 634 err = parse_bracket_terms(ctx, s, &ls, &neg); 635 if (err != REG_OK) 636 goto parse_bracket_done; 637 638 if (neg.negate) { 639 /* Sort the array if we need to negate it. */ 640 qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit); 641 /* extra lit for the last negated range */ 642 lit = tre_new_lit(&ls); 643 if (!lit) { 644 err = REG_ESPACE; 645 goto parse_bracket_done; 646 } 647 lit->code_min = TRE_CHAR_MAX+1; 648 lit->code_max = TRE_CHAR_MAX+1; 649 lit->position = -1; 650 /* negated classes */ 651 if (neg.len) { 652 nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a); 653 if (!nc) { 654 err = REG_ESPACE; 655 goto parse_bracket_done; 656 } 657 memcpy(nc, neg.a, neg.len*sizeof *neg.a); 658 nc[neg.len] = 0; 659 } 660 } 661 662 /* Build a union of the items in the array, negated if necessary. */ 663 negmax = negmin = 0; 664 for (i = 0; i < ls.len; i++) { 665 lit = ls.a[i]; 666 min = lit->code_min; 667 max = lit->code_max; 668 if (neg.negate) { 669 if (min <= negmin) { 670 /* Overlap. */ 671 negmin = MAX(max + 1, negmin); 672 continue; 673 } 674 negmax = min - 1; 675 lit->code_min = negmin; 676 lit->code_max = negmax; 677 negmin = max + 1; 678 } 679 lit->position = ctx->position; 680 lit->neg_classes = nc; 681 n = tre_ast_new_node(ctx->mem, LITERAL, lit); 682 node = tre_ast_new_union(ctx->mem, node, n); 683 if (!node) { 684 err = REG_ESPACE; 685 break; 686 } 687 } 688 689parse_bracket_done: 690 xfree(ls.a); 691 ctx->position++; 692 ctx->n = node; 693 return err; 694} 695 696static const char *parse_dup_count(const char *s, int *n) 697{ 698 *n = -1; 699 if (!isdigit(*s)) 700 return s; 701 *n = 0; 702 for (;;) { 703 *n = 10 * *n + (*s - '0'); 704 s++; 705 if (!isdigit(*s) || *n > RE_DUP_MAX) 706 break; 707 } 708 return s; 709} 710 711static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax) 712{ 713 int min, max; 714 715 s = parse_dup_count(s, &min); 716 if (*s == ',') 717 s = parse_dup_count(s+1, &max); 718 else 719 max = min; 720 721 if ( 722 (max < min && max >= 0) || 723 max > RE_DUP_MAX || 724 min > RE_DUP_MAX || 725 min < 0 || 726 (!ere && *s++ != '\\') || 727 *s++ != '}' 728 ) 729 return 0; 730 *pmin = min; 731 *pmax = max; 732 return s; 733} 734 735static int hexval(unsigned c) 736{ 737 if (c-'0'<10) return c-'0'; 738 c |= 32; 739 if (c-'a'<6) return c-'a'+10; 740 return -1; 741} 742 743static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid) 744{ 745 if (node->submatch_id >= 0) { 746 tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 747 if (!n) 748 return REG_ESPACE; 749 n = tre_ast_new_catenation(ctx->mem, n, node); 750 if (!n) 751 return REG_ESPACE; 752 n->num_submatches = node->num_submatches; 753 node = n; 754 } 755 node->submatch_id = subid; 756 node->num_submatches++; 757 ctx->n = node; 758 return REG_OK; 759} 760 761/* 762BRE grammar: 763Regex = Branch | '^' | '$' | '^$' | '^' Branch | Branch '$' | '^' Branch '$' 764Branch = Atom | Branch Atom 765Atom = char | quoted_char | '.' | Bracket | Atom Dup | '\(' Branch '\)' | back_ref 766Dup = '*' | '\{' Count '\}' | '\{' Count ',\}' | '\{' Count ',' Count '\}' 767 768(leading ^ and trailing $ in a sub expr may be an anchor or literal as well) 769 770ERE grammar: 771Regex = Branch | Regex '|' Branch 772Branch = Atom | Branch Atom 773Atom = char | quoted_char | '.' | Bracket | Atom Dup | '(' Regex ')' | '^' | '$' 774Dup = '*' | '+' | '?' | '{' Count '}' | '{' Count ',}' | '{' Count ',' Count '}' 775 776(a*+?, ^*, $+, \X, {, (|a) are unspecified) 777*/ 778 779static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s) 780{ 781 int len, ere = ctx->cflags & REG_EXTENDED; 782 const char *p; 783 tre_ast_node_t *node; 784 wchar_t wc; 785 switch (*s) { 786 case '[': 787 return parse_bracket(ctx, s+1); 788 case '\\': 789 p = tre_expand_macro(s+1); 790 if (p) { 791 /* assume \X expansion is a single atom */ 792 reg_errcode_t err = parse_atom(ctx, p); 793 ctx->s = s+2; 794 return err; 795 } 796 /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */ 797 switch (*++s) { 798 case 0: 799 return REG_EESCAPE; 800 case 'b': 801 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1); 802 break; 803 case 'B': 804 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1); 805 break; 806 case '<': 807 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1); 808 break; 809 case '>': 810 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1); 811 break; 812 case 'x': 813 s++; 814 int i, v = 0, c; 815 len = 2; 816 if (*s == '{') { 817 len = 8; 818 s++; 819 } 820 for (i=0; i<len && v<0x110000; i++) { 821 c = hexval(s[i]); 822 if (c < 0) break; 823 v = 16*v + c; 824 } 825 s += i; 826 if (len == 8) { 827 if (*s != '}') 828 return REG_EBRACE; 829 s++; 830 } 831 node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++); 832 s--; 833 break; 834 case '{': 835 case '+': 836 case '?': 837 /* extension: treat \+, \? as repetitions in BRE */ 838 /* reject repetitions after empty expression in BRE */ 839 if (!ere) 840 return REG_BADRPT; 841 case '|': 842 /* extension: treat \| as alternation in BRE */ 843 if (!ere) { 844 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 845 s--; 846 goto end; 847 } 848 /* fallthrough */ 849 default: 850 if (!ere && (unsigned)*s-'1' < 9) { 851 /* back reference */ 852 int val = *s - '0'; 853 node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++); 854 ctx->max_backref = MAX(val, ctx->max_backref); 855 } else { 856 /* extension: accept unknown escaped char 857 as a literal */ 858 goto parse_literal; 859 } 860 } 861 s++; 862 break; 863 case '.': 864 if (ctx->cflags & REG_NEWLINE) { 865 tre_ast_node_t *tmp1, *tmp2; 866 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++); 867 tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++); 868 if (tmp1 && tmp2) 869 node = tre_ast_new_union(ctx->mem, tmp1, tmp2); 870 else 871 node = 0; 872 } else { 873 node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++); 874 } 875 s++; 876 break; 877 case '^': 878 /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */ 879 if (!ere && s != ctx->start) 880 goto parse_literal; 881 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1); 882 s++; 883 break; 884 case '$': 885 /* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */ 886 if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|'))) 887 goto parse_literal; 888 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1); 889 s++; 890 break; 891 case '*': 892 case '{': 893 case '+': 894 case '?': 895 /* reject repetitions after empty expression in ERE */ 896 if (ere) 897 return REG_BADRPT; 898 case '|': 899 if (!ere) 900 goto parse_literal; 901 case 0: 902 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 903 break; 904 default: 905parse_literal: 906 len = mbtowc(&wc, s, -1); 907 if (len < 0) 908 return REG_BADPAT; 909 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) { 910 tre_ast_node_t *tmp1, *tmp2; 911 /* multiple opposite case characters are not supported */ 912 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position); 913 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position); 914 if (tmp1 && tmp2) 915 node = tre_ast_new_union(ctx->mem, tmp1, tmp2); 916 else 917 node = 0; 918 } else { 919 node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position); 920 } 921 ctx->position++; 922 s += len; 923 break; 924 } 925end: 926 if (!node) 927 return REG_ESPACE; 928 ctx->n = node; 929 ctx->s = s; 930 return REG_OK; 931} 932 933#define PUSHPTR(err, s, v) do { \ 934 if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \ 935 return err; \ 936} while(0) 937 938#define PUSHINT(err, s, v) do { \ 939 if ((err = tre_stack_push_int(s, v)) != REG_OK) \ 940 return err; \ 941} while(0) 942 943static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) 944{ 945 tre_ast_node_t *nbranch=0, *nunion=0; 946 int ere = ctx->cflags & REG_EXTENDED; 947 const char *s = ctx->start; 948 int subid = 0; 949 int depth = 0; 950 reg_errcode_t err; 951 tre_stack_t *stack = ctx->stack; 952 953 PUSHINT(err, stack, subid++); 954 for (;;) { 955 if ((!ere && *s == '\\' && s[1] == '(') || 956 (ere && *s == '(')) { 957 PUSHPTR(err, stack, nunion); 958 PUSHPTR(err, stack, nbranch); 959 PUSHINT(err, stack, subid++); 960 s++; 961 if (!ere) 962 s++; 963 depth++; 964 nbranch = nunion = 0; 965 ctx->start = s; 966 continue; 967 } 968 if ((!ere && *s == '\\' && s[1] == ')') || 969 (ere && *s == ')' && depth)) { 970 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 971 if (!ctx->n) 972 return REG_ESPACE; 973 } else { 974 err = parse_atom(ctx, s); 975 if (err != REG_OK) 976 return err; 977 s = ctx->s; 978 } 979 980 parse_iter: 981 for (;;) { 982 int min, max; 983 984 if (*s!='\\' && *s!='*') { 985 if (!ere) 986 break; 987 if (*s!='+' && *s!='?' && *s!='{') 988 break; 989 } 990 if (*s=='\\' && ere) 991 break; 992 /* extension: treat \+, \? as repetitions in BRE */ 993 if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{') 994 break; 995 if (*s=='\\') 996 s++; 997 998 /* handle ^* at the start of a BRE. */ 999 if (!ere && s==ctx->start+1 && s[-1]=='^') 1000 break; 1001 1002 /* extension: multiple consecutive *+?{,} is unspecified, 1003 but (a+)+ has to be supported so accepting a++ makes 1004 sense, note however that the RE_DUP_MAX limit can be 1005 circumvented: (a{255}){255} uses a lot of memory.. */ 1006 if (*s=='{') { 1007 s = parse_dup(s+1, ere, &min, &max); 1008 if (!s) 1009 return REG_BADBR; 1010 } else { 1011 min=0; 1012 max=-1; 1013 if (*s == '+') 1014 min = 1; 1015 if (*s == '?') 1016 max = 1; 1017 s++; 1018 } 1019 if (max == 0) 1020 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1021 else 1022 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); 1023 if (!ctx->n) 1024 return REG_ESPACE; 1025 } 1026 1027 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n); 1028 if ((ere && *s == '|') || 1029 (ere && *s == ')' && depth) || 1030 (!ere && *s == '\\' && s[1] == ')') || 1031 /* extension: treat \| as alternation in BRE */ 1032 (!ere && *s == '\\' && s[1] == '|') || 1033 !*s) { 1034 /* extension: empty branch is unspecified (), (|a), (a|) 1035 here they are not rejected but match on empty string */ 1036 int c = *s; 1037 nunion = tre_ast_new_union(ctx->mem, nunion, nbranch); 1038 nbranch = 0; 1039 1040 if (c == '\\' && s[1] == '|') { 1041 s+=2; 1042 ctx->start = s; 1043 } else if (c == '|') { 1044 s++; 1045 ctx->start = s; 1046 } else { 1047 if (c == '\\') { 1048 if (!depth) return REG_EPAREN; 1049 s+=2; 1050 } else if (c == ')') 1051 s++; 1052 depth--; 1053 err = marksub(ctx, nunion, tre_stack_pop_int(stack)); 1054 if (err != REG_OK) 1055 return err; 1056 if (!c && depth<0) { 1057 ctx->submatch_id = subid; 1058 return REG_OK; 1059 } 1060 if (!c || depth<0) 1061 return REG_EPAREN; 1062 nbranch = tre_stack_pop_voidptr(stack); 1063 nunion = tre_stack_pop_voidptr(stack); 1064 goto parse_iter; 1065 } 1066 } 1067 } 1068} 1069 1070 1071/*********************************************************************** 1072 from tre-compile.c 1073***********************************************************************/ 1074 1075 1076/* 1077 TODO: 1078 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive 1079 function calls. 1080*/ 1081 1082/* 1083 Algorithms to setup tags so that submatch addressing can be done. 1084*/ 1085 1086 1087/* Inserts a catenation node to the root of the tree given in `node'. 1088 As the left child a new tag with number `tag_id' to `node' is added, 1089 and the right child is the old root. */ 1090static reg_errcode_t 1091tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id) 1092{ 1093 tre_catenation_t *c; 1094 1095 c = tre_mem_alloc(mem, sizeof(*c)); 1096 if (c == NULL) 1097 return REG_ESPACE; 1098 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1); 1099 if (c->left == NULL) 1100 return REG_ESPACE; 1101 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); 1102 if (c->right == NULL) 1103 return REG_ESPACE; 1104 1105 c->right->obj = node->obj; 1106 c->right->type = node->type; 1107 c->right->nullable = -1; 1108 c->right->submatch_id = -1; 1109 c->right->firstpos = NULL; 1110 c->right->lastpos = NULL; 1111 c->right->num_tags = 0; 1112 c->right->num_submatches = 0; 1113 node->obj = c; 1114 node->type = CATENATION; 1115 return REG_OK; 1116} 1117 1118/* Inserts a catenation node to the root of the tree given in `node'. 1119 As the right child a new tag with number `tag_id' to `node' is added, 1120 and the left child is the old root. */ 1121static reg_errcode_t 1122tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id) 1123{ 1124 tre_catenation_t *c; 1125 1126 c = tre_mem_alloc(mem, sizeof(*c)); 1127 if (c == NULL) 1128 return REG_ESPACE; 1129 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1); 1130 if (c->right == NULL) 1131 return REG_ESPACE; 1132 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); 1133 if (c->left == NULL) 1134 return REG_ESPACE; 1135 1136 c->left->obj = node->obj; 1137 c->left->type = node->type; 1138 c->left->nullable = -1; 1139 c->left->submatch_id = -1; 1140 c->left->firstpos = NULL; 1141 c->left->lastpos = NULL; 1142 c->left->num_tags = 0; 1143 c->left->num_submatches = 0; 1144 node->obj = c; 1145 node->type = CATENATION; 1146 return REG_OK; 1147} 1148 1149typedef enum { 1150 ADDTAGS_RECURSE, 1151 ADDTAGS_AFTER_ITERATION, 1152 ADDTAGS_AFTER_UNION_LEFT, 1153 ADDTAGS_AFTER_UNION_RIGHT, 1154 ADDTAGS_AFTER_CAT_LEFT, 1155 ADDTAGS_AFTER_CAT_RIGHT, 1156 ADDTAGS_SET_SUBMATCH_END 1157} tre_addtags_symbol_t; 1158 1159 1160typedef struct { 1161 int tag; 1162 int next_tag; 1163} tre_tag_states_t; 1164 1165 1166/* Go through `regset' and set submatch data for submatches that are 1167 using this tag. */ 1168static void 1169tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag) 1170{ 1171 int i; 1172 1173 for (i = 0; regset[i] >= 0; i++) 1174 { 1175 int id = regset[i] / 2; 1176 int start = !(regset[i] % 2); 1177 if (start) 1178 tnfa->submatch_data[id].so_tag = tag; 1179 else 1180 tnfa->submatch_data[id].eo_tag = tag; 1181 } 1182 regset[0] = -1; 1183} 1184 1185 1186/* Adds tags to appropriate locations in the parse tree in `tree', so that 1187 subexpressions marked for submatch addressing can be traced. */ 1188static reg_errcode_t 1189tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, 1190 tre_tnfa_t *tnfa) 1191{ 1192 reg_errcode_t status = REG_OK; 1193 tre_addtags_symbol_t symbol; 1194 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */ 1195 int bottom = tre_stack_num_objects(stack); 1196 /* True for first pass (counting number of needed tags) */ 1197 int first_pass = (mem == NULL || tnfa == NULL); 1198 int *regset, *orig_regset; 1199 int num_tags = 0; /* Total number of tags. */ 1200 int num_minimals = 0; /* Number of special minimal tags. */ 1201 int tag = 0; /* The tag that is to be added next. */ 1202 int next_tag = 1; /* Next tag to use after this one. */ 1203 int *parents; /* Stack of submatches the current submatch is 1204 contained in. */ 1205 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */ 1206 tre_tag_states_t *saved_states; 1207 1208 tre_tag_direction_t direction = TRE_TAG_MINIMIZE; 1209 if (!first_pass) 1210 { 1211 tnfa->end_tag = 0; 1212 tnfa->minimal_tags[0] = -1; 1213 } 1214 1215 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2)); 1216 if (regset == NULL) 1217 return REG_ESPACE; 1218 regset[0] = -1; 1219 orig_regset = regset; 1220 1221 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1)); 1222 if (parents == NULL) 1223 { 1224 xfree(regset); 1225 return REG_ESPACE; 1226 } 1227 parents[0] = -1; 1228 1229 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1)); 1230 if (saved_states == NULL) 1231 { 1232 xfree(regset); 1233 xfree(parents); 1234 return REG_ESPACE; 1235 } 1236 else 1237 { 1238 unsigned int i; 1239 for (i = 0; i <= tnfa->num_submatches; i++) 1240 saved_states[i].tag = -1; 1241 } 1242 1243 STACK_PUSH(stack, voidptr, node); 1244 STACK_PUSH(stack, int, ADDTAGS_RECURSE); 1245 1246 while (tre_stack_num_objects(stack) > bottom) 1247 { 1248 if (status != REG_OK) 1249 break; 1250 1251 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack); 1252 switch (symbol) 1253 { 1254 1255 case ADDTAGS_SET_SUBMATCH_END: 1256 { 1257 int id = tre_stack_pop_int(stack); 1258 int i; 1259 1260 /* Add end of this submatch to regset. */ 1261 for (i = 0; regset[i] >= 0; i++); 1262 regset[i] = id * 2 + 1; 1263 regset[i + 1] = -1; 1264 1265 /* Pop this submatch from the parents stack. */ 1266 for (i = 0; parents[i] >= 0; i++); 1267 parents[i - 1] = -1; 1268 break; 1269 } 1270 1271 case ADDTAGS_RECURSE: 1272 node = tre_stack_pop_voidptr(stack); 1273 1274 if (node->submatch_id >= 0) 1275 { 1276 int id = node->submatch_id; 1277 int i; 1278 1279 1280 /* Add start of this submatch to regset. */ 1281 for (i = 0; regset[i] >= 0; i++); 1282 regset[i] = id * 2; 1283 regset[i + 1] = -1; 1284 1285 if (!first_pass) 1286 { 1287 for (i = 0; parents[i] >= 0; i++); 1288 tnfa->submatch_data[id].parents = NULL; 1289 if (i > 0) 1290 { 1291 int *p = xmalloc(sizeof(*p) * (i + 1)); 1292 if (p == NULL) 1293 { 1294 status = REG_ESPACE; 1295 break; 1296 } 1297 assert(tnfa->submatch_data[id].parents == NULL); 1298 tnfa->submatch_data[id].parents = p; 1299 for (i = 0; parents[i] >= 0; i++) 1300 p[i] = parents[i]; 1301 p[i] = -1; 1302 } 1303 } 1304 1305 /* Add end of this submatch to regset after processing this 1306 node. */ 1307 STACK_PUSHX(stack, int, node->submatch_id); 1308 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END); 1309 } 1310 1311 switch (node->type) 1312 { 1313 case LITERAL: 1314 { 1315 tre_literal_t *lit = node->obj; 1316 1317 if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) 1318 { 1319 int i; 1320 if (regset[0] >= 0) 1321 { 1322 /* Regset is not empty, so add a tag before the 1323 literal or backref. */ 1324 if (!first_pass) 1325 { 1326 status = tre_add_tag_left(mem, node, tag); 1327 tnfa->tag_directions[tag] = direction; 1328 if (minimal_tag >= 0) 1329 { 1330 for (i = 0; tnfa->minimal_tags[i] >= 0; i++); 1331 tnfa->minimal_tags[i] = tag; 1332 tnfa->minimal_tags[i + 1] = minimal_tag; 1333 tnfa->minimal_tags[i + 2] = -1; 1334 minimal_tag = -1; 1335 num_minimals++; 1336 } 1337 tre_purge_regset(regset, tnfa, tag); 1338 } 1339 else 1340 { 1341 node->num_tags = 1; 1342 } 1343 1344 regset[0] = -1; 1345 tag = next_tag; 1346 num_tags++; 1347 next_tag++; 1348 } 1349 } 1350 else 1351 { 1352 assert(!IS_TAG(lit)); 1353 } 1354 break; 1355 } 1356 case CATENATION: 1357 { 1358 tre_catenation_t *cat = node->obj; 1359 tre_ast_node_t *left = cat->left; 1360 tre_ast_node_t *right = cat->right; 1361 int reserved_tag = -1; 1362 1363 1364 /* After processing right child. */ 1365 STACK_PUSHX(stack, voidptr, node); 1366 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT); 1367 1368 /* Process right child. */ 1369 STACK_PUSHX(stack, voidptr, right); 1370 STACK_PUSHX(stack, int, ADDTAGS_RECURSE); 1371 1372 /* After processing left child. */ 1373 STACK_PUSHX(stack, int, next_tag + left->num_tags); 1374 if (left->num_tags > 0 && right->num_tags > 0) 1375 { 1376 /* Reserve the next tag to the right child. */ 1377 reserved_tag = next_tag; 1378 next_tag++; 1379 } 1380 STACK_PUSHX(stack, int, reserved_tag); 1381 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT); 1382 1383 /* Process left child. */ 1384 STACK_PUSHX(stack, voidptr, left); 1385 STACK_PUSHX(stack, int, ADDTAGS_RECURSE); 1386 1387 } 1388 break; 1389 case ITERATION: 1390 { 1391 tre_iteration_t *iter = node->obj; 1392 1393 if (first_pass) 1394 { 1395 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal); 1396 } 1397 else 1398 { 1399 STACK_PUSHX(stack, int, tag); 1400 STACK_PUSHX(stack, int, iter->minimal); 1401 } 1402 STACK_PUSHX(stack, voidptr, node); 1403 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION); 1404 1405 STACK_PUSHX(stack, voidptr, iter->arg); 1406 STACK_PUSHX(stack, int, ADDTAGS_RECURSE); 1407 1408 /* Regset is not empty, so add a tag here. */ 1409 if (regset[0] >= 0 || iter->minimal) 1410 { 1411 if (!first_pass) 1412 { 1413 int i; 1414 status = tre_add_tag_left(mem, node, tag); 1415 if (iter->minimal) 1416 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; 1417 else 1418 tnfa->tag_directions[tag] = direction; 1419 if (minimal_tag >= 0) 1420 { 1421 for (i = 0; tnfa->minimal_tags[i] >= 0; i++); 1422 tnfa->minimal_tags[i] = tag; 1423 tnfa->minimal_tags[i + 1] = minimal_tag; 1424 tnfa->minimal_tags[i + 2] = -1; 1425 minimal_tag = -1; 1426 num_minimals++; 1427 } 1428 tre_purge_regset(regset, tnfa, tag); 1429 } 1430 1431 regset[0] = -1; 1432 tag = next_tag; 1433 num_tags++; 1434 next_tag++; 1435 } 1436 direction = TRE_TAG_MINIMIZE; 1437 } 1438 break; 1439 case UNION: 1440 { 1441 tre_union_t *uni = node->obj; 1442 tre_ast_node_t *left = uni->left; 1443 tre_ast_node_t *right = uni->right; 1444 int left_tag; 1445 int right_tag; 1446 1447 if (regset[0] >= 0) 1448 { 1449 left_tag = next_tag; 1450 right_tag = next_tag + 1; 1451 } 1452 else 1453 { 1454 left_tag = tag; 1455 right_tag = next_tag; 1456 } 1457 1458 /* After processing right child. */ 1459 STACK_PUSHX(stack, int, right_tag); 1460 STACK_PUSHX(stack, int, left_tag); 1461 STACK_PUSHX(stack, voidptr, regset); 1462 STACK_PUSHX(stack, int, regset[0] >= 0); 1463 STACK_PUSHX(stack, voidptr, node); 1464 STACK_PUSHX(stack, voidptr, right); 1465 STACK_PUSHX(stack, voidptr, left); 1466 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT); 1467 1468 /* Process right child. */ 1469 STACK_PUSHX(stack, voidptr, right); 1470 STACK_PUSHX(stack, int, ADDTAGS_RECURSE); 1471 1472 /* After processing left child. */ 1473 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT); 1474 1475 /* Process left child. */ 1476 STACK_PUSHX(stack, voidptr, left); 1477 STACK_PUSHX(stack, int, ADDTAGS_RECURSE); 1478 1479 /* Regset is not empty, so add a tag here. */ 1480 if (regset[0] >= 0) 1481 { 1482 if (!first_pass) 1483 { 1484 int i; 1485 status = tre_add_tag_left(mem, node, tag); 1486 tnfa->tag_directions[tag] = direction; 1487 if (minimal_tag >= 0) 1488 { 1489 for (i = 0; tnfa->minimal_tags[i] >= 0; i++); 1490 tnfa->minimal_tags[i] = tag; 1491 tnfa->minimal_tags[i + 1] = minimal_tag; 1492 tnfa->minimal_tags[i + 2] = -1; 1493 minimal_tag = -1; 1494 num_minimals++; 1495 } 1496 tre_purge_regset(regset, tnfa, tag); 1497 } 1498 1499 regset[0] = -1; 1500 tag = next_tag; 1501 num_tags++; 1502 next_tag++; 1503 } 1504 1505 if (node->num_submatches > 0) 1506 { 1507 /* The next two tags are reserved for markers. */ 1508 next_tag++; 1509 tag = next_tag; 1510 next_tag++; 1511 } 1512 1513 break; 1514 } 1515 } 1516 1517 if (node->submatch_id >= 0) 1518 { 1519 int i; 1520 /* Push this submatch on the parents stack. */ 1521 for (i = 0; parents[i] >= 0; i++); 1522 parents[i] = node->submatch_id; 1523 parents[i + 1] = -1; 1524 } 1525 1526 break; /* end case: ADDTAGS_RECURSE */ 1527 1528 case ADDTAGS_AFTER_ITERATION: 1529 { 1530 int minimal = 0; 1531 int enter_tag; 1532 node = tre_stack_pop_voidptr(stack); 1533 if (first_pass) 1534 { 1535 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags 1536 + tre_stack_pop_int(stack); 1537 minimal_tag = -1; 1538 } 1539 else 1540 { 1541 minimal = tre_stack_pop_int(stack); 1542 enter_tag = tre_stack_pop_int(stack); 1543 if (minimal) 1544 minimal_tag = enter_tag; 1545 } 1546 1547 if (!first_pass) 1548 { 1549 if (minimal) 1550 direction = TRE_TAG_MINIMIZE; 1551 else 1552 direction = TRE_TAG_MAXIMIZE; 1553 } 1554 break; 1555 } 1556 1557 case ADDTAGS_AFTER_CAT_LEFT: 1558 { 1559 int new_tag = tre_stack_pop_int(stack); 1560 next_tag = tre_stack_pop_int(stack); 1561 if (new_tag >= 0) 1562 { 1563 tag = new_tag; 1564 } 1565 break; 1566 } 1567 1568 case ADDTAGS_AFTER_CAT_RIGHT: 1569 node = tre_stack_pop_voidptr(stack); 1570 if (first_pass) 1571 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags 1572 + ((tre_catenation_t *)node->obj)->right->num_tags; 1573 break; 1574 1575 case ADDTAGS_AFTER_UNION_LEFT: 1576 /* Lift the bottom of the `regset' array so that when processing 1577 the right operand the items currently in the array are 1578 invisible. The original bottom was saved at ADDTAGS_UNION and 1579 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */ 1580 while (*regset >= 0) 1581 regset++; 1582 break; 1583 1584 case ADDTAGS_AFTER_UNION_RIGHT: 1585 { 1586 int added_tags, tag_left, tag_right; 1587 tre_ast_node_t *left = tre_stack_pop_voidptr(stack); 1588 tre_ast_node_t *right = tre_stack_pop_voidptr(stack); 1589 node = tre_stack_pop_voidptr(stack); 1590 added_tags = tre_stack_pop_int(stack); 1591 if (first_pass) 1592 { 1593 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags 1594 + ((tre_union_t *)node->obj)->right->num_tags + added_tags 1595 + ((node->num_submatches > 0) ? 2 : 0); 1596 } 1597 regset = tre_stack_pop_voidptr(stack); 1598 tag_left = tre_stack_pop_int(stack); 1599 tag_right = tre_stack_pop_int(stack); 1600 1601 /* Add tags after both children, the left child gets a smaller 1602 tag than the right child. This guarantees that we prefer 1603 the left child over the right child. */ 1604 /* XXX - This is not always necessary (if the children have 1605 tags which must be seen for every match of that child). */ 1606 /* XXX - Check if this is the only place where tre_add_tag_right 1607 is used. If so, use tre_add_tag_left (putting the tag before 1608 the child as opposed after the child) and throw away 1609 tre_add_tag_right. */ 1610 if (node->num_submatches > 0) 1611 { 1612 if (!first_pass) 1613 { 1614 status = tre_add_tag_right(mem, left, tag_left); 1615 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE; 1616 if (status == REG_OK) 1617 status = tre_add_tag_right(mem, right, tag_right); 1618 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE; 1619 } 1620 num_tags += 2; 1621 } 1622 direction = TRE_TAG_MAXIMIZE; 1623 break; 1624 } 1625 1626 default: 1627 assert(0); 1628 break; 1629 1630 } /* end switch(symbol) */ 1631 } /* end while(tre_stack_num_objects(stack) > bottom) */ 1632 1633 if (!first_pass) 1634 tre_purge_regset(regset, tnfa, tag); 1635 1636 if (!first_pass && minimal_tag >= 0) 1637 { 1638 int i; 1639 for (i = 0; tnfa->minimal_tags[i] >= 0; i++); 1640 tnfa->minimal_tags[i] = tag; 1641 tnfa->minimal_tags[i + 1] = minimal_tag; 1642 tnfa->minimal_tags[i + 2] = -1; 1643 minimal_tag = -1; 1644 num_minimals++; 1645 } 1646 1647 assert(tree->num_tags == num_tags); 1648 tnfa->end_tag = num_tags; 1649 tnfa->num_tags = num_tags; 1650 tnfa->num_minimals = num_minimals; 1651 xfree(orig_regset); 1652 xfree(parents); 1653 xfree(saved_states); 1654 return status; 1655} 1656 1657 1658 1659/* 1660 AST to TNFA compilation routines. 1661*/ 1662 1663typedef enum { 1664 COPY_RECURSE, 1665 COPY_SET_RESULT_PTR 1666} tre_copyast_symbol_t; 1667 1668/* Flags for tre_copy_ast(). */ 1669#define COPY_REMOVE_TAGS 1 1670#define COPY_MAXIMIZE_FIRST_TAG 2 1671 1672static reg_errcode_t 1673tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, 1674 int flags, int *pos_add, tre_tag_direction_t *tag_directions, 1675 tre_ast_node_t **copy, int *max_pos) 1676{ 1677 reg_errcode_t status = REG_OK; 1678 int bottom = tre_stack_num_objects(stack); 1679 int num_copied = 0; 1680 int first_tag = 1; 1681 tre_ast_node_t **result = copy; 1682 tre_copyast_symbol_t symbol; 1683 1684 STACK_PUSH(stack, voidptr, ast); 1685 STACK_PUSH(stack, int, COPY_RECURSE); 1686 1687 while (status == REG_OK && tre_stack_num_objects(stack) > bottom) 1688 { 1689 tre_ast_node_t *node; 1690 if (status != REG_OK) 1691 break; 1692 1693 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack); 1694 switch (symbol) 1695 { 1696 case COPY_SET_RESULT_PTR: 1697 result = tre_stack_pop_voidptr(stack); 1698 break; 1699 case COPY_RECURSE: 1700 node = tre_stack_pop_voidptr(stack); 1701 switch (node->type) 1702 { 1703 case LITERAL: 1704 { 1705 tre_literal_t *lit = node->obj; 1706 int pos = lit->position; 1707 int min = lit->code_min; 1708 int max = lit->code_max; 1709 if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) 1710 { 1711 /* XXX - e.g. [ab] has only one position but two 1712 nodes, so we are creating holes in the state space 1713 here. Not fatal, just wastes memory. */ 1714 pos += *pos_add; 1715 num_copied++; 1716 } 1717 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS)) 1718 { 1719 /* Change this tag to empty. */ 1720 min = EMPTY; 1721 max = pos = -1; 1722 } 1723 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG) 1724 && first_tag) 1725 { 1726 /* Maximize the first tag. */ 1727 tag_directions[max] = TRE_TAG_MAXIMIZE; 1728 first_tag = 0; 1729 } 1730 *result = tre_ast_new_literal(mem, min, max, pos); 1731 if (*result == NULL) 1732 status = REG_ESPACE; 1733 else { 1734 tre_literal_t *p = (*result)->obj; 1735 p->class = lit->class; 1736 p->neg_classes = lit->neg_classes; 1737 } 1738 1739 if (pos > *max_pos) 1740 *max_pos = pos; 1741 break; 1742 } 1743 case UNION: 1744 { 1745 tre_union_t *uni = node->obj; 1746 tre_union_t *tmp; 1747 *result = tre_ast_new_union(mem, uni->left, uni->right); 1748 if (*result == NULL) 1749 { 1750 status = REG_ESPACE; 1751 break; 1752 } 1753 tmp = (*result)->obj; 1754 result = &tmp->left; 1755 STACK_PUSHX(stack, voidptr, uni->right); 1756 STACK_PUSHX(stack, int, COPY_RECURSE); 1757 STACK_PUSHX(stack, voidptr, &tmp->right); 1758 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR); 1759 STACK_PUSHX(stack, voidptr, uni->left); 1760 STACK_PUSHX(stack, int, COPY_RECURSE); 1761 break; 1762 } 1763 case CATENATION: 1764 { 1765 tre_catenation_t *cat = node->obj; 1766 tre_catenation_t *tmp; 1767 *result = tre_ast_new_catenation(mem, cat->left, cat->right); 1768 if (*result == NULL) 1769 { 1770 status = REG_ESPACE; 1771 break; 1772 } 1773 tmp = (*result)->obj; 1774 tmp->left = NULL; 1775 tmp->right = NULL; 1776 result = &tmp->left; 1777 1778 STACK_PUSHX(stack, voidptr, cat->right); 1779 STACK_PUSHX(stack, int, COPY_RECURSE); 1780 STACK_PUSHX(stack, voidptr, &tmp->right); 1781 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR); 1782 STACK_PUSHX(stack, voidptr, cat->left); 1783 STACK_PUSHX(stack, int, COPY_RECURSE); 1784 break; 1785 } 1786 case ITERATION: 1787 { 1788 tre_iteration_t *iter = node->obj; 1789 STACK_PUSHX(stack, voidptr, iter->arg); 1790 STACK_PUSHX(stack, int, COPY_RECURSE); 1791 *result = tre_ast_new_iter(mem, iter->arg, iter->min, 1792 iter->max, iter->minimal); 1793 if (*result == NULL) 1794 { 1795 status = REG_ESPACE; 1796 break; 1797 } 1798 iter = (*result)->obj; 1799 result = &iter->arg; 1800 break; 1801 } 1802 default: 1803 assert(0); 1804 break; 1805 } 1806 break; 1807 } 1808 } 1809 *pos_add += num_copied; 1810 return status; 1811} 1812 1813typedef enum { 1814 EXPAND_RECURSE, 1815 EXPAND_AFTER_ITER 1816} tre_expand_ast_symbol_t; 1817 1818/* Expands each iteration node that has a finite nonzero minimum or maximum 1819 iteration count to a catenated sequence of copies of the node. */ 1820static reg_errcode_t 1821tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, 1822 int *position, tre_tag_direction_t *tag_directions) 1823{ 1824 reg_errcode_t status = REG_OK; 1825 int bottom = tre_stack_num_objects(stack); 1826 int pos_add = 0; 1827 int pos_add_total = 0; 1828 int max_pos = 0; 1829 int iter_depth = 0; 1830 1831 STACK_PUSHR(stack, voidptr, ast); 1832 STACK_PUSHR(stack, int, EXPAND_RECURSE); 1833 while (status == REG_OK && tre_stack_num_objects(stack) > bottom) 1834 { 1835 tre_ast_node_t *node; 1836 tre_expand_ast_symbol_t symbol; 1837 1838 if (status != REG_OK) 1839 break; 1840 1841 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack); 1842 node = tre_stack_pop_voidptr(stack); 1843 switch (symbol) 1844 { 1845 case EXPAND_RECURSE: 1846 switch (node->type) 1847 { 1848 case LITERAL: 1849 { 1850 tre_literal_t *lit= node->obj; 1851 if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) 1852 { 1853 lit->position += pos_add; 1854 if (lit->position > max_pos) 1855 max_pos = lit->position; 1856 } 1857 break; 1858 } 1859 case UNION: 1860 { 1861 tre_union_t *uni = node->obj; 1862 STACK_PUSHX(stack, voidptr, uni->right); 1863 STACK_PUSHX(stack, int, EXPAND_RECURSE); 1864 STACK_PUSHX(stack, voidptr, uni->left); 1865 STACK_PUSHX(stack, int, EXPAND_RECURSE); 1866 break; 1867 } 1868 case CATENATION: 1869 { 1870 tre_catenation_t *cat = node->obj; 1871 STACK_PUSHX(stack, voidptr, cat->right); 1872 STACK_PUSHX(stack, int, EXPAND_RECURSE); 1873 STACK_PUSHX(stack, voidptr, cat->left); 1874 STACK_PUSHX(stack, int, EXPAND_RECURSE); 1875 break; 1876 } 1877 case ITERATION: 1878 { 1879 tre_iteration_t *iter = node->obj; 1880 STACK_PUSHX(stack, int, pos_add); 1881 STACK_PUSHX(stack, voidptr, node); 1882 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER); 1883 STACK_PUSHX(stack, voidptr, iter->arg); 1884 STACK_PUSHX(stack, int, EXPAND_RECURSE); 1885 /* If we are going to expand this node at EXPAND_AFTER_ITER 1886 then don't increase the `pos' fields of the nodes now, it 1887 will get done when expanding. */ 1888 if (iter->min > 1 || iter->max > 1) 1889 pos_add = 0; 1890 iter_depth++; 1891 break; 1892 } 1893 default: 1894 assert(0); 1895 break; 1896 } 1897 break; 1898 case EXPAND_AFTER_ITER: 1899 { 1900 tre_iteration_t *iter = node->obj; 1901 int pos_add_last; 1902 pos_add = tre_stack_pop_int(stack); 1903 pos_add_last = pos_add; 1904 if (iter->min > 1 || iter->max > 1) 1905 { 1906 tre_ast_node_t *seq1 = NULL, *seq2 = NULL; 1907 int j; 1908 int pos_add_save = pos_add; 1909 1910 /* Create a catenated sequence of copies of the node. */ 1911 for (j = 0; j < iter->min; j++) 1912 { 1913 tre_ast_node_t *copy; 1914 /* Remove tags from all but the last copy. */ 1915 int flags = ((j + 1 < iter->min) 1916 ? COPY_REMOVE_TAGS 1917 : COPY_MAXIMIZE_FIRST_TAG); 1918 pos_add_save = pos_add; 1919 status = tre_copy_ast(mem, stack, iter->arg, flags, 1920 &pos_add, tag_directions, ©, 1921 &max_pos); 1922 if (status != REG_OK) 1923 return status; 1924 if (seq1 != NULL) 1925 seq1 = tre_ast_new_catenation(mem, seq1, copy); 1926 else 1927 seq1 = copy; 1928 if (seq1 == NULL) 1929 return REG_ESPACE; 1930 } 1931 1932 if (iter->max == -1) 1933 { 1934 /* No upper limit. */ 1935 pos_add_save = pos_add; 1936 status = tre_copy_ast(mem, stack, iter->arg, 0, 1937 &pos_add, NULL, &seq2, &max_pos); 1938 if (status != REG_OK) 1939 return status; 1940 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0); 1941 if (seq2 == NULL) 1942 return REG_ESPACE; 1943 } 1944 else 1945 { 1946 for (j = iter->min; j < iter->max; j++) 1947 { 1948 tre_ast_node_t *tmp, *copy; 1949 pos_add_save = pos_add; 1950 status = tre_copy_ast(mem, stack, iter->arg, 0, 1951 &pos_add, NULL, ©, &max_pos); 1952 if (status != REG_OK) 1953 return status; 1954 if (seq2 != NULL) 1955 seq2 = tre_ast_new_catenation(mem, copy, seq2); 1956 else 1957 seq2 = copy; 1958 if (seq2 == NULL) 1959 return REG_ESPACE; 1960 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1); 1961 if (tmp == NULL) 1962 return REG_ESPACE; 1963 seq2 = tre_ast_new_union(mem, tmp, seq2); 1964 if (seq2 == NULL) 1965 return REG_ESPACE; 1966 } 1967 } 1968 1969 pos_add = pos_add_save; 1970 if (seq1 == NULL) 1971 seq1 = seq2; 1972 else if (seq2 != NULL) 1973 seq1 = tre_ast_new_catenation(mem, seq1, seq2); 1974 if (seq1 == NULL) 1975 return REG_ESPACE; 1976 node->obj = seq1->obj; 1977 node->type = seq1->type; 1978 } 1979 1980 iter_depth--; 1981 pos_add_total += pos_add - pos_add_last; 1982 if (iter_depth == 0) 1983 pos_add = pos_add_total; 1984 1985 break; 1986 } 1987 default: 1988 assert(0); 1989 break; 1990 } 1991 } 1992 1993 *position += pos_add_total; 1994 1995 /* `max_pos' should never be larger than `*position' if the above 1996 code works, but just an extra safeguard let's make sure 1997 `*position' is set large enough so enough memory will be 1998 allocated for the transition table. */ 1999 if (max_pos > *position) 2000 *position = max_pos; 2001 2002 return status; 2003} 2004 2005static tre_pos_and_tags_t * 2006tre_set_empty(tre_mem_t mem) 2007{ 2008 tre_pos_and_tags_t *new_set; 2009 2010 new_set = tre_mem_calloc(mem, sizeof(*new_set)); 2011 if (new_set == NULL) 2012 return NULL; 2013 2014 new_set[0].position = -1; 2015 new_set[0].code_min = -1; 2016 new_set[0].code_max = -1; 2017 2018 return new_set; 2019} 2020 2021static tre_pos_and_tags_t * 2022tre_set_one(tre_mem_t mem, int position, int code_min, int code_max, 2023 tre_ctype_t class, tre_ctype_t *neg_classes, int backref) 2024{ 2025 tre_pos_and_tags_t *new_set; 2026 2027 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2); 2028 if (new_set == NULL) 2029 return NULL; 2030 2031 new_set[0].position = position; 2032 new_set[0].code_min = code_min; 2033 new_set[0].code_max = code_max; 2034 new_set[0].class = class; 2035 new_set[0].neg_classes = neg_classes; 2036 new_set[0].backref = backref; 2037 new_set[1].position = -1; 2038 new_set[1].code_min = -1; 2039 new_set[1].code_max = -1; 2040 2041 return new_set; 2042} 2043 2044static tre_pos_and_tags_t * 2045tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2, 2046 int *tags, int assertions) 2047{ 2048 int s1, s2, i, j; 2049 tre_pos_and_tags_t *new_set; 2050 int *new_tags; 2051 int num_tags; 2052 2053 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++); 2054 for (s1 = 0; set1[s1].position >= 0; s1++); 2055 for (s2 = 0; set2[s2].position >= 0; s2++); 2056 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1)); 2057 if (!new_set ) 2058 return NULL; 2059 2060 for (s1 = 0; set1[s1].position >= 0; s1++) 2061 { 2062 new_set[s1].position = set1[s1].position; 2063 new_set[s1].code_min = set1[s1].code_min; 2064 new_set[s1].code_max = set1[s1].code_max; 2065 new_set[s1].assertions = set1[s1].assertions | assertions; 2066 new_set[s1].class = set1[s1].class; 2067 new_set[s1].neg_classes = set1[s1].neg_classes; 2068 new_set[s1].backref = set1[s1].backref; 2069 if (set1[s1].tags == NULL && tags == NULL) 2070 new_set[s1].tags = NULL; 2071 else 2072 { 2073 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++); 2074 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags) 2075 * (i + num_tags + 1))); 2076 if (new_tags == NULL) 2077 return NULL; 2078 for (j = 0; j < i; j++) 2079 new_tags[j] = set1[s1].tags[j]; 2080 for (i = 0; i < num_tags; i++) 2081 new_tags[j + i] = tags[i]; 2082 new_tags[j + i] = -1; 2083 new_set[s1].tags = new_tags; 2084 } 2085 } 2086 2087 for (s2 = 0; set2[s2].position >= 0; s2++) 2088 { 2089 new_set[s1 + s2].position = set2[s2].position; 2090 new_set[s1 + s2].code_min = set2[s2].code_min; 2091 new_set[s1 + s2].code_max = set2[s2].code_max; 2092 /* XXX - why not | assertions here as well? */ 2093 new_set[s1 + s2].assertions = set2[s2].assertions; 2094 new_set[s1 + s2].class = set2[s2].class; 2095 new_set[s1 + s2].neg_classes = set2[s2].neg_classes; 2096 new_set[s1 + s2].backref = set2[s2].backref; 2097 if (set2[s2].tags == NULL) 2098 new_set[s1 + s2].tags = NULL; 2099 else 2100 { 2101 for (i = 0; set2[s2].tags[i] >= 0; i++); 2102 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1)); 2103 if (new_tags == NULL) 2104 return NULL; 2105 for (j = 0; j < i; j++) 2106 new_tags[j] = set2[s2].tags[j]; 2107 new_tags[j] = -1; 2108 new_set[s1 + s2].tags = new_tags; 2109 } 2110 } 2111 new_set[s1 + s2].position = -1; 2112 return new_set; 2113} 2114 2115/* Finds the empty path through `node' which is the one that should be 2116 taken according to POSIX.2 rules, and adds the tags on that path to 2117 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is 2118 set to the number of tags seen on the path. */ 2119static reg_errcode_t 2120tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, 2121 int *assertions, int *num_tags_seen) 2122{ 2123 tre_literal_t *lit; 2124 tre_union_t *uni; 2125 tre_catenation_t *cat; 2126 tre_iteration_t *iter; 2127 int i; 2128 int bottom = tre_stack_num_objects(stack); 2129 reg_errcode_t status = REG_OK; 2130 if (num_tags_seen) 2131 *num_tags_seen = 0; 2132 2133 status = tre_stack_push_voidptr(stack, node); 2134 2135 /* Walk through the tree recursively. */ 2136 while (status == REG_OK && tre_stack_num_objects(stack) > bottom) 2137 { 2138 node = tre_stack_pop_voidptr(stack); 2139 2140 switch (node->type) 2141 { 2142 case LITERAL: 2143 lit = (tre_literal_t *)node->obj; 2144 switch (lit->code_min) 2145 { 2146 case TAG: 2147 if (lit->code_max >= 0) 2148 { 2149 if (tags != NULL) 2150 { 2151 /* Add the tag to `tags'. */ 2152 for (i = 0; tags[i] >= 0; i++) 2153 if (tags[i] == lit->code_max) 2154 break; 2155 if (tags[i] < 0) 2156 { 2157 tags[i] = lit->code_max; 2158 tags[i + 1] = -1; 2159 } 2160 } 2161 if (num_tags_seen) 2162 (*num_tags_seen)++; 2163 } 2164 break; 2165 case ASSERTION: 2166 assert(lit->code_max >= 1 2167 || lit->code_max <= ASSERT_LAST); 2168 if (assertions != NULL) 2169 *assertions |= lit->code_max; 2170 break; 2171 case EMPTY: 2172 break; 2173 default: 2174 assert(0); 2175 break; 2176 } 2177 break; 2178 2179 case UNION: 2180 /* Subexpressions starting earlier take priority over ones 2181 starting later, so we prefer the left subexpression over the 2182 right subexpression. */ 2183 uni = (tre_union_t *)node->obj; 2184 if (uni->left->nullable) 2185 STACK_PUSHX(stack, voidptr, uni->left) 2186 else if (uni->right->nullable) 2187 STACK_PUSHX(stack, voidptr, uni->right) 2188 else 2189 assert(0); 2190 break; 2191 2192 case CATENATION: 2193 /* The path must go through both children. */ 2194 cat = (tre_catenation_t *)node->obj; 2195 assert(cat->left->nullable); 2196 assert(cat->right->nullable); 2197 STACK_PUSHX(stack, voidptr, cat->left); 2198 STACK_PUSHX(stack, voidptr, cat->right); 2199 break; 2200 2201 case ITERATION: 2202 /* A match with an empty string is preferred over no match at 2203 all, so we go through the argument if possible. */ 2204 iter = (tre_iteration_t *)node->obj; 2205 if (iter->arg->nullable) 2206 STACK_PUSHX(stack, voidptr, iter->arg); 2207 break; 2208 2209 default: 2210 assert(0); 2211 break; 2212 } 2213 } 2214 2215 return status; 2216} 2217 2218 2219typedef enum { 2220 NFL_RECURSE, 2221 NFL_POST_UNION, 2222 NFL_POST_CATENATION, 2223 NFL_POST_ITERATION 2224} tre_nfl_stack_symbol_t; 2225 2226 2227/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for 2228 the nodes of the AST `tree'. */ 2229static reg_errcode_t 2230tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) 2231{ 2232 int bottom = tre_stack_num_objects(stack); 2233 2234 STACK_PUSHR(stack, voidptr, tree); 2235 STACK_PUSHR(stack, int, NFL_RECURSE); 2236 2237 while (tre_stack_num_objects(stack) > bottom) 2238 { 2239 tre_nfl_stack_symbol_t symbol; 2240 tre_ast_node_t *node; 2241 2242 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack); 2243 node = tre_stack_pop_voidptr(stack); 2244 switch (symbol) 2245 { 2246 case NFL_RECURSE: 2247 switch (node->type) 2248 { 2249 case LITERAL: 2250 { 2251 tre_literal_t *lit = (tre_literal_t *)node->obj; 2252 if (IS_BACKREF(lit)) 2253 { 2254 /* Back references: nullable = false, firstpos = {i}, 2255 lastpos = {i}. */ 2256 node->nullable = 0; 2257 node->firstpos = tre_set_one(mem, lit->position, 0, 2258 TRE_CHAR_MAX, 0, NULL, -1); 2259 if (!node->firstpos) 2260 return REG_ESPACE; 2261 node->lastpos = tre_set_one(mem, lit->position, 0, 2262 TRE_CHAR_MAX, 0, NULL, 2263 (int)lit->code_max); 2264 if (!node->lastpos) 2265 return REG_ESPACE; 2266 } 2267 else if (lit->code_min < 0) 2268 { 2269 /* Tags, empty strings, params, and zero width assertions: 2270 nullable = true, firstpos = {}, and lastpos = {}. */ 2271 node->nullable = 1; 2272 node->firstpos = tre_set_empty(mem); 2273 if (!node->firstpos) 2274 return REG_ESPACE; 2275 node->lastpos = tre_set_empty(mem); 2276 if (!node->lastpos) 2277 return REG_ESPACE; 2278 } 2279 else 2280 { 2281 /* Literal at position i: nullable = false, firstpos = {i}, 2282 lastpos = {i}. */ 2283 node->nullable = 0; 2284 node->firstpos = 2285 tre_set_one(mem, lit->position, (int)lit->code_min, 2286 (int)lit->code_max, 0, NULL, -1); 2287 if (!node->firstpos) 2288 return REG_ESPACE; 2289 node->lastpos = tre_set_one(mem, lit->position, 2290 (int)lit->code_min, 2291 (int)lit->code_max, 2292 lit->class, lit->neg_classes, 2293 -1); 2294 if (!node->lastpos) 2295 return REG_ESPACE; 2296 } 2297 break; 2298 } 2299 2300 case UNION: 2301 /* Compute the attributes for the two subtrees, and after that 2302 for this node. */ 2303 STACK_PUSHR(stack, voidptr, node); 2304 STACK_PUSHR(stack, int, NFL_POST_UNION); 2305 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right); 2306 STACK_PUSHR(stack, int, NFL_RECURSE); 2307 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left); 2308 STACK_PUSHR(stack, int, NFL_RECURSE); 2309 break; 2310 2311 case CATENATION: 2312 /* Compute the attributes for the two subtrees, and after that 2313 for this node. */ 2314 STACK_PUSHR(stack, voidptr, node); 2315 STACK_PUSHR(stack, int, NFL_POST_CATENATION); 2316 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right); 2317 STACK_PUSHR(stack, int, NFL_RECURSE); 2318 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left); 2319 STACK_PUSHR(stack, int, NFL_RECURSE); 2320 break; 2321 2322 case ITERATION: 2323 /* Compute the attributes for the subtree, and after that for 2324 this node. */ 2325 STACK_PUSHR(stack, voidptr, node); 2326 STACK_PUSHR(stack, int, NFL_POST_ITERATION); 2327 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg); 2328 STACK_PUSHR(stack, int, NFL_RECURSE); 2329 break; 2330 } 2331 break; /* end case: NFL_RECURSE */ 2332 2333 case NFL_POST_UNION: 2334 { 2335 tre_union_t *uni = (tre_union_t *)node->obj; 2336 node->nullable = uni->left->nullable || uni->right->nullable; 2337 node->firstpos = tre_set_union(mem, uni->left->firstpos, 2338 uni->right->firstpos, NULL, 0); 2339 if (!node->firstpos) 2340 return REG_ESPACE; 2341 node->lastpos = tre_set_union(mem, uni->left->lastpos, 2342 uni->right->lastpos, NULL, 0); 2343 if (!node->lastpos) 2344 return REG_ESPACE; 2345 break; 2346 } 2347 2348 case NFL_POST_ITERATION: 2349 { 2350 tre_iteration_t *iter = (tre_iteration_t *)node->obj; 2351 2352 if (iter->min == 0 || iter->arg->nullable) 2353 node->nullable = 1; 2354 else 2355 node->nullable = 0; 2356 node->firstpos = iter->arg->firstpos; 2357 node->lastpos = iter->arg->lastpos; 2358 break; 2359 } 2360 2361 case NFL_POST_CATENATION: 2362 { 2363 int num_tags, *tags, assertions; 2364 reg_errcode_t status; 2365 tre_catenation_t *cat = node->obj; 2366 node->nullable = cat->left->nullable && cat->right->nullable; 2367 2368 /* Compute firstpos. */ 2369 if (cat->left->nullable) 2370 { 2371 /* The left side matches the empty string. Make a first pass 2372 with tre_match_empty() to get the number of tags and 2373 parameters. */ 2374 status = tre_match_empty(stack, cat->left, 2375 NULL, NULL, &num_tags); 2376 if (status != REG_OK) 2377 return status; 2378 /* Allocate arrays for the tags and parameters. */ 2379 tags = xmalloc(sizeof(*tags) * (num_tags + 1)); 2380 if (!tags) 2381 return REG_ESPACE; 2382 tags[0] = -1; 2383 assertions = 0; 2384 /* Second pass with tre_mach_empty() to get the list of 2385 tags and parameters. */ 2386 status = tre_match_empty(stack, cat->left, tags, 2387 &assertions, NULL); 2388 if (status != REG_OK) 2389 { 2390 xfree(tags); 2391 return status; 2392 } 2393 node->firstpos = 2394 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos, 2395 tags, assertions); 2396 xfree(tags); 2397 if (!node->firstpos) 2398 return REG_ESPACE; 2399 } 2400 else 2401 { 2402 node->firstpos = cat->left->firstpos; 2403 } 2404 2405 /* Compute lastpos. */ 2406 if (cat->right->nullable) 2407 { 2408 /* The right side matches the empty string. Make a first pass 2409 with tre_match_empty() to get the number of tags and 2410 parameters. */ 2411 status = tre_match_empty(stack, cat->right, 2412 NULL, NULL, &num_tags); 2413 if (status != REG_OK) 2414 return status; 2415 /* Allocate arrays for the tags and parameters. */ 2416 tags = xmalloc(sizeof(int) * (num_tags + 1)); 2417 if (!tags) 2418 return REG_ESPACE; 2419 tags[0] = -1; 2420 assertions = 0; 2421 /* Second pass with tre_mach_empty() to get the list of 2422 tags and parameters. */ 2423 status = tre_match_empty(stack, cat->right, tags, 2424 &assertions, NULL); 2425 if (status != REG_OK) 2426 { 2427 xfree(tags); 2428 return status; 2429 } 2430 node->lastpos = 2431 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos, 2432 tags, assertions); 2433 xfree(tags); 2434 if (!node->lastpos) 2435 return REG_ESPACE; 2436 } 2437 else 2438 { 2439 node->lastpos = cat->right->lastpos; 2440 } 2441 break; 2442 } 2443 2444 default: 2445 assert(0); 2446 break; 2447 } 2448 } 2449 2450 return REG_OK; 2451} 2452 2453 2454/* Adds a transition from each position in `p1' to each position in `p2'. */ 2455static reg_errcode_t 2456tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, 2457 tre_tnfa_transition_t *transitions, 2458 int *counts, int *offs) 2459{ 2460 tre_pos_and_tags_t *orig_p2 = p2; 2461 tre_tnfa_transition_t *trans; 2462 int i, j, k, l, dup, prev_p2_pos; 2463 2464 if (transitions != NULL) 2465 while (p1->position >= 0) 2466 { 2467 p2 = orig_p2; 2468 prev_p2_pos = -1; 2469 while (p2->position >= 0) 2470 { 2471 /* Optimization: if this position was already handled, skip it. */ 2472 if (p2->position == prev_p2_pos) 2473 { 2474 p2++; 2475 continue; 2476 } 2477 prev_p2_pos = p2->position; 2478 /* Set `trans' to point to the next unused transition from 2479 position `p1->position'. */ 2480 trans = transitions + offs[p1->position]; 2481 while (trans->state != NULL) 2482 { 2483#if 0 2484 /* If we find a previous transition from `p1->position' to 2485 `p2->position', it is overwritten. This can happen only 2486 if there are nested loops in the regexp, like in "((a)*)*". 2487 In POSIX.2 repetition using the outer loop is always 2488 preferred over using the inner loop. Therefore the 2489 transition for the inner loop is useless and can be thrown 2490 away. */ 2491 /* XXX - The same position is used for all nodes in a bracket 2492 expression, so this optimization cannot be used (it will 2493 break bracket expressions) unless I figure out a way to 2494 detect it here. */ 2495 if (trans->state_id == p2->position) 2496 { 2497 break; 2498 } 2499#endif 2500 trans++; 2501 } 2502 2503 if (trans->state == NULL) 2504 (trans + 1)->state = NULL; 2505 /* Use the character ranges, assertions, etc. from `p1' for 2506 the transition from `p1' to `p2'. */ 2507 trans->code_min = p1->code_min; 2508 trans->code_max = p1->code_max; 2509 trans->state = transitions + offs[p2->position]; 2510 trans->state_id = p2->position; 2511 trans->assertions = p1->assertions | p2->assertions 2512 | (p1->class ? ASSERT_CHAR_CLASS : 0) 2513 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0); 2514 if (p1->backref >= 0) 2515 { 2516 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0); 2517 assert(p2->backref < 0); 2518 trans->u.backref = p1->backref; 2519 trans->assertions |= ASSERT_BACKREF; 2520 } 2521 else 2522 trans->u.class = p1->class; 2523 if (p1->neg_classes != NULL) 2524 { 2525 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++); 2526 trans->neg_classes = 2527 xmalloc(sizeof(*trans->neg_classes) * (i + 1)); 2528 if (trans->neg_classes == NULL) 2529 return REG_ESPACE; 2530 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++) 2531 trans->neg_classes[i] = p1->neg_classes[i]; 2532 trans->neg_classes[i] = (tre_ctype_t)0; 2533 } 2534 else 2535 trans->neg_classes = NULL; 2536 2537 /* Find out how many tags this transition has. */ 2538 i = 0; 2539 if (p1->tags != NULL) 2540 while(p1->tags[i] >= 0) 2541 i++; 2542 j = 0; 2543 if (p2->tags != NULL) 2544 while(p2->tags[j] >= 0) 2545 j++; 2546 2547 /* If we are overwriting a transition, free the old tag array. */ 2548 if (trans->tags != NULL) 2549 xfree(trans->tags); 2550 trans->tags = NULL; 2551 2552 /* If there were any tags, allocate an array and fill it. */ 2553 if (i + j > 0) 2554 { 2555 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1)); 2556 if (!trans->tags) 2557 return REG_ESPACE; 2558 i = 0; 2559 if (p1->tags != NULL) 2560 while(p1->tags[i] >= 0) 2561 { 2562 trans->tags[i] = p1->tags[i]; 2563 i++; 2564 } 2565 l = i; 2566 j = 0; 2567 if (p2->tags != NULL) 2568 while (p2->tags[j] >= 0) 2569 { 2570 /* Don't add duplicates. */ 2571 dup = 0; 2572 for (k = 0; k < i; k++) 2573 if (trans->tags[k] == p2->tags[j]) 2574 { 2575 dup = 1; 2576 break; 2577 } 2578 if (!dup) 2579 trans->tags[l++] = p2->tags[j]; 2580 j++; 2581 } 2582 trans->tags[l] = -1; 2583 } 2584 2585 p2++; 2586 } 2587 p1++; 2588 } 2589 else 2590 /* Compute a maximum limit for the number of transitions leaving 2591 from each state. */ 2592 while (p1->position >= 0) 2593 { 2594 p2 = orig_p2; 2595 while (p2->position >= 0) 2596 { 2597 counts[p1->position]++; 2598 p2++; 2599 } 2600 p1++; 2601 } 2602 return REG_OK; 2603} 2604 2605/* Converts the syntax tree to a TNFA. All the transitions in the TNFA are 2606 labelled with one character range (there are no transitions on empty 2607 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of 2608 the regexp. */ 2609static reg_errcode_t 2610tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions, 2611 int *counts, int *offs) 2612{ 2613 tre_union_t *uni; 2614 tre_catenation_t *cat; 2615 tre_iteration_t *iter; 2616 reg_errcode_t errcode = REG_OK; 2617 2618 /* XXX - recurse using a stack!. */ 2619 switch (node->type) 2620 { 2621 case LITERAL: 2622 break; 2623 case UNION: 2624 uni = (tre_union_t *)node->obj; 2625 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs); 2626 if (errcode != REG_OK) 2627 return errcode; 2628 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs); 2629 break; 2630 2631 case CATENATION: 2632 cat = (tre_catenation_t *)node->obj; 2633 /* Add a transition from each position in cat->left->lastpos 2634 to each position in cat->right->firstpos. */ 2635 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos, 2636 transitions, counts, offs); 2637 if (errcode != REG_OK) 2638 return errcode; 2639 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs); 2640 if (errcode != REG_OK) 2641 return errcode; 2642 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs); 2643 break; 2644 2645 case ITERATION: 2646 iter = (tre_iteration_t *)node->obj; 2647 assert(iter->max == -1 || iter->max == 1); 2648 2649 if (iter->max == -1) 2650 { 2651 assert(iter->min == 0 || iter->min == 1); 2652 /* Add a transition from each last position in the iterated 2653 expression to each first position. */ 2654 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos, 2655 transitions, counts, offs); 2656 if (errcode != REG_OK) 2657 return errcode; 2658 } 2659 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs); 2660 break; 2661 } 2662 return errcode; 2663} 2664 2665 2666#define ERROR_EXIT(err) \ 2667 do \ 2668 { \ 2669 errcode = err; \ 2670 if (/*CONSTCOND*/1) \ 2671 goto error_exit; \ 2672 } \ 2673 while (/*CONSTCOND*/0) 2674 2675 2676int 2677regcomp(regex_t *restrict preg, const char *restrict regex, int cflags) 2678{ 2679 tre_stack_t *stack; 2680 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r; 2681 tre_pos_and_tags_t *p; 2682 int *counts = NULL, *offs = NULL; 2683 int i, add = 0; 2684 tre_tnfa_transition_t *transitions, *initial; 2685 tre_tnfa_t *tnfa = NULL; 2686 tre_submatch_data_t *submatch_data; 2687 tre_tag_direction_t *tag_directions = NULL; 2688 reg_errcode_t errcode; 2689 tre_mem_t mem; 2690 2691 /* Parse context. */ 2692 tre_parse_ctx_t parse_ctx; 2693 2694 /* Allocate a stack used throughout the compilation process for various 2695 purposes. */ 2696 stack = tre_stack_new(512, 1024000, 128); 2697 if (!stack) 2698 return REG_ESPACE; 2699 /* Allocate a fast memory allocator. */ 2700 mem = tre_mem_new(); 2701 if (!mem) 2702 { 2703 tre_stack_destroy(stack); 2704 return REG_ESPACE; 2705 } 2706 2707 /* Parse the regexp. */ 2708 memset(&parse_ctx, 0, sizeof(parse_ctx)); 2709 parse_ctx.mem = mem; 2710 parse_ctx.stack = stack; 2711 parse_ctx.start = regex; 2712 parse_ctx.cflags = cflags; 2713 parse_ctx.max_backref = -1; 2714 errcode = tre_parse(&parse_ctx); 2715 if (errcode != REG_OK) 2716 ERROR_EXIT(errcode); 2717 preg->re_nsub = parse_ctx.submatch_id - 1; 2718 tree = parse_ctx.n; 2719 2720#ifdef TRE_DEBUG 2721 tre_ast_print(tree); 2722#endif /* TRE_DEBUG */ 2723 2724 /* Referring to nonexistent subexpressions is illegal. */ 2725 if (parse_ctx.max_backref > (int)preg->re_nsub) 2726 ERROR_EXIT(REG_ESUBREG); 2727 2728 /* Allocate the TNFA struct. */ 2729 tnfa = xcalloc(1, sizeof(tre_tnfa_t)); 2730 if (tnfa == NULL) 2731 ERROR_EXIT(REG_ESPACE); 2732 tnfa->have_backrefs = parse_ctx.max_backref >= 0; 2733 tnfa->have_approx = 0; 2734 tnfa->num_submatches = parse_ctx.submatch_id; 2735 2736 /* Set up tags for submatch addressing. If REG_NOSUB is set and the 2737 regexp does not have back references, this can be skipped. */ 2738 if (tnfa->have_backrefs || !(cflags & REG_NOSUB)) 2739 { 2740 2741 /* Figure out how many tags we will need. */ 2742 errcode = tre_add_tags(NULL, stack, tree, tnfa); 2743 if (errcode != REG_OK) 2744 ERROR_EXIT(errcode); 2745 2746 if (tnfa->num_tags > 0) 2747 { 2748 tag_directions = xmalloc(sizeof(*tag_directions) 2749 * (tnfa->num_tags + 1)); 2750 if (tag_directions == NULL) 2751 ERROR_EXIT(REG_ESPACE); 2752 tnfa->tag_directions = tag_directions; 2753 memset(tag_directions, -1, 2754 sizeof(*tag_directions) * (tnfa->num_tags + 1)); 2755 } 2756 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1, 2757 sizeof(*tnfa->minimal_tags)); 2758 if (tnfa->minimal_tags == NULL) 2759 ERROR_EXIT(REG_ESPACE); 2760 2761 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id, 2762 sizeof(*submatch_data)); 2763 if (submatch_data == NULL) 2764 ERROR_EXIT(REG_ESPACE); 2765 tnfa->submatch_data = submatch_data; 2766 2767 errcode = tre_add_tags(mem, stack, tree, tnfa); 2768 if (errcode != REG_OK) 2769 ERROR_EXIT(errcode); 2770 2771 } 2772 2773 /* Expand iteration nodes. */ 2774 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, 2775 tag_directions); 2776 if (errcode != REG_OK) 2777 ERROR_EXIT(errcode); 2778 2779 /* Add a dummy node for the final state. 2780 XXX - For certain patterns this dummy node can be optimized away, 2781 for example "a*" or "ab*". Figure out a simple way to detect 2782 this possibility. */ 2783 tmp_ast_l = tree; 2784 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++); 2785 if (tmp_ast_r == NULL) 2786 ERROR_EXIT(REG_ESPACE); 2787 2788 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r); 2789 if (tree == NULL) 2790 ERROR_EXIT(REG_ESPACE); 2791 2792 errcode = tre_compute_nfl(mem, stack, tree); 2793 if (errcode != REG_OK) 2794 ERROR_EXIT(errcode); 2795 2796 counts = xmalloc(sizeof(int) * parse_ctx.position); 2797 if (counts == NULL) 2798 ERROR_EXIT(REG_ESPACE); 2799 2800 offs = xmalloc(sizeof(int) * parse_ctx.position); 2801 if (offs == NULL) 2802 ERROR_EXIT(REG_ESPACE); 2803 2804 for (i = 0; i < parse_ctx.position; i++) 2805 counts[i] = 0; 2806 tre_ast_to_tnfa(tree, NULL, counts, NULL); 2807 2808 add = 0; 2809 for (i = 0; i < parse_ctx.position; i++) 2810 { 2811 offs[i] = add; 2812 add += counts[i] + 1; 2813 counts[i] = 0; 2814 } 2815 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions)); 2816 if (transitions == NULL) 2817 ERROR_EXIT(REG_ESPACE); 2818 tnfa->transitions = transitions; 2819 tnfa->num_transitions = add; 2820 2821 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs); 2822 if (errcode != REG_OK) 2823 ERROR_EXIT(errcode); 2824 2825 tnfa->firstpos_chars = NULL; 2826 2827 p = tree->firstpos; 2828 i = 0; 2829 while (p->position >= 0) 2830 { 2831 i++; 2832 p++; 2833 } 2834 2835 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t)); 2836 if (initial == NULL) 2837 ERROR_EXIT(REG_ESPACE); 2838 tnfa->initial = initial; 2839 2840 i = 0; 2841 for (p = tree->firstpos; p->position >= 0; p++) 2842 { 2843 initial[i].state = transitions + offs[p->position]; 2844 initial[i].state_id = p->position; 2845 initial[i].tags = NULL; 2846 /* Copy the arrays p->tags, and p->params, they are allocated 2847 from a tre_mem object. */ 2848 if (p->tags) 2849 { 2850 int j; 2851 for (j = 0; p->tags[j] >= 0; j++); 2852 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1)); 2853 if (!initial[i].tags) 2854 ERROR_EXIT(REG_ESPACE); 2855 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1)); 2856 } 2857 initial[i].assertions = p->assertions; 2858 i++; 2859 } 2860 initial[i].state = NULL; 2861 2862 tnfa->num_transitions = add; 2863 tnfa->final = transitions + offs[tree->lastpos[0].position]; 2864 tnfa->num_states = parse_ctx.position; 2865 tnfa->cflags = cflags; 2866 2867 tre_mem_destroy(mem); 2868 tre_stack_destroy(stack); 2869 xfree(counts); 2870 xfree(offs); 2871 2872 preg->TRE_REGEX_T_FIELD = (void *)tnfa; 2873 return REG_OK; 2874 2875 error_exit: 2876 /* Free everything that was allocated and return the error code. */ 2877 tre_mem_destroy(mem); 2878 if (stack != NULL) 2879 tre_stack_destroy(stack); 2880 if (counts != NULL) 2881 xfree(counts); 2882 if (offs != NULL) 2883 xfree(offs); 2884 preg->TRE_REGEX_T_FIELD = (void *)tnfa; 2885 regfree(preg); 2886 return errcode; 2887} 2888 2889 2890 2891 2892void 2893regfree(regex_t *preg) 2894{ 2895 tre_tnfa_t *tnfa; 2896 unsigned int i; 2897 tre_tnfa_transition_t *trans; 2898 2899 tnfa = (void *)preg->TRE_REGEX_T_FIELD; 2900 if (!tnfa) 2901 return; 2902 2903 for (i = 0; i < tnfa->num_transitions; i++) 2904 if (tnfa->transitions[i].state) 2905 { 2906 if (tnfa->transitions[i].tags) 2907 xfree(tnfa->transitions[i].tags); 2908 if (tnfa->transitions[i].neg_classes) 2909 xfree(tnfa->transitions[i].neg_classes); 2910 } 2911 if (tnfa->transitions) 2912 xfree(tnfa->transitions); 2913 2914 if (tnfa->initial) 2915 { 2916 for (trans = tnfa->initial; trans->state; trans++) 2917 { 2918 if (trans->tags) 2919 xfree(trans->tags); 2920 } 2921 xfree(tnfa->initial); 2922 } 2923 2924 if (tnfa->submatch_data) 2925 { 2926 for (i = 0; i < tnfa->num_submatches; i++) 2927 if (tnfa->submatch_data[i].parents) 2928 xfree(tnfa->submatch_data[i].parents); 2929 xfree(tnfa->submatch_data); 2930 } 2931 2932 if (tnfa->tag_directions) 2933 xfree(tnfa->tag_directions); 2934 if (tnfa->firstpos_chars) 2935 xfree(tnfa->firstpos_chars); 2936 if (tnfa->minimal_tags) 2937 xfree(tnfa->minimal_tags); 2938 xfree(tnfa); 2939} 2940