265 g->nplus = pluscount(p, g); 266 g->magic = MAGIC2; 267 preg->re_nsub = g->nsub; 268 preg->re_g = g; 269 preg->re_magic = MAGIC1; 270#ifndef REDEBUG 271 /* not debugging, so can't rely on the assert() in regexec() */ 272 if (g->iflags&BAD) 273 SETERROR(REG_ASSERT); 274#endif 275 276 /* win or lose, we're done */ 277 if (p->error != 0) /* lose */ 278 regfree(preg); 279 return(p->error); 280} 281 282/* 283 - p_ere - ERE parser top level, concatenation and alternation 284 == static void p_ere(register struct parse *p, int stop); 285 */ 286static void 287p_ere(p, stop) 288register struct parse *p; 289int stop; /* character this ERE should end at */ 290{ 291 register char c; 292 register sopno prevback; 293 register sopno prevfwd; 294 register sopno conc; 295 register int first = 1; /* is this the first alternative? */ 296 297 for (;;) { 298 /* do a bunch of concatenated expressions */ 299 conc = HERE(); 300 while (MORE() && (c = PEEK()) != '|' && c != stop) 301 p_ere_exp(p); 302 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 303 304 if (!EAT('|')) 305 break; /* NOTE BREAK OUT */ 306 307 if (first) { 308 INSERT(OCH_, conc); /* offset is wrong */ 309 prevfwd = conc; 310 prevback = conc; 311 first = 0; 312 } 313 ASTERN(OOR1, prevback); 314 prevback = THERE(); 315 AHEAD(prevfwd); /* fix previous offset */ 316 prevfwd = HERE(); 317 EMIT(OOR2, 0); /* offset is very wrong */ 318 } 319 320 if (!first) { /* tail-end fixups */ 321 AHEAD(prevfwd); 322 ASTERN(O_CH, prevback); 323 } 324 325 assert(!MORE() || SEE(stop)); 326} 327 328/* 329 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 330 == static void p_ere_exp(register struct parse *p); 331 */ 332static void 333p_ere_exp(p) 334register struct parse *p; 335{ 336 register char c; 337 register sopno pos; 338 register int count; 339 register int count2; 340 register sopno subno; 341 int wascaret = 0; 342 343 assert(MORE()); /* caller should have ensured this */ 344 c = GETNEXT(); 345 346 pos = HERE(); 347 switch (c) { 348 case '(': 349 (void)REQUIRE(MORE(), REG_EPAREN); 350 p->g->nsub++; 351 subno = p->g->nsub; 352 if (subno < NPAREN) 353 p->pbegin[subno] = HERE(); 354 EMIT(OLPAREN, subno); 355 if (!SEE(')')) 356 p_ere(p, ')'); 357 if (subno < NPAREN) { 358 p->pend[subno] = HERE(); 359 assert(p->pend[subno] != 0); 360 } 361 EMIT(ORPAREN, subno); 362 (void)MUSTEAT(')', REG_EPAREN); 363 break; 364#ifndef POSIX_MISTAKE 365 case ')': /* happens only if no current unmatched ( */ 366 /* 367 * You may ask, why the ifndef? Because I didn't notice 368 * this until slightly too late for 1003.2, and none of the 369 * other 1003.2 regular-expression reviewers noticed it at 370 * all. So an unmatched ) is legal POSIX, at least until 371 * we can get it fixed. 372 */ 373 SETERROR(REG_EPAREN); 374 break; 375#endif 376 case '^': 377 EMIT(OBOL, 0); 378 p->g->iflags |= USEBOL; 379 p->g->nbol++; 380 wascaret = 1; 381 break; 382 case '$': 383 EMIT(OEOL, 0); 384 p->g->iflags |= USEEOL; 385 p->g->neol++; 386 break; 387 case '|': 388 SETERROR(REG_EMPTY); 389 break; 390 case '*': 391 case '+': 392 case '?': 393 SETERROR(REG_BADRPT); 394 break; 395 case '.': 396 if (p->g->cflags®_NEWLINE) 397 nonnewline(p); 398 else 399 EMIT(OANY, 0); 400 break; 401 case '[': 402 p_bracket(p); 403 break; 404 case '\\': 405 (void)REQUIRE(MORE(), REG_EESCAPE); 406 c = GETNEXT(); 407 ordinary(p, c); 408 break; 409 case '{': /* okay as ordinary except if digit follows */ 410 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 411 /* FALLTHROUGH */ 412 default: 413 ordinary(p, c); 414 break; 415 } 416 417 if (!MORE()) 418 return; 419 c = PEEK(); 420 /* we call { a repetition if followed by a digit */ 421 if (!( c == '*' || c == '+' || c == '?' || 422 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 423 return; /* no repetition, we're done */ 424 NEXT(); 425 426 (void)REQUIRE(!wascaret, REG_BADRPT); 427 switch (c) { 428 case '*': /* implemented as +? */ 429 /* this case does not require the (y|) trick, noKLUDGE */ 430 INSERT(OPLUS_, pos); 431 ASTERN(O_PLUS, pos); 432 INSERT(OQUEST_, pos); 433 ASTERN(O_QUEST, pos); 434 break; 435 case '+': 436 INSERT(OPLUS_, pos); 437 ASTERN(O_PLUS, pos); 438 break; 439 case '?': 440 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 441 INSERT(OCH_, pos); /* offset slightly wrong */ 442 ASTERN(OOR1, pos); /* this one's right */ 443 AHEAD(pos); /* fix the OCH_ */ 444 EMIT(OOR2, 0); /* offset very wrong... */ 445 AHEAD(THERE()); /* ...so fix it */ 446 ASTERN(O_CH, THERETHERE()); 447 break; 448 case '{': 449 count = p_count(p); 450 if (EAT(',')) { 451 if (isdigit((uch)PEEK())) { 452 count2 = p_count(p); 453 (void)REQUIRE(count <= count2, REG_BADBR); 454 } else /* single number with comma */ 455 count2 = INFINITY; 456 } else /* just a single number */ 457 count2 = count; 458 repeat(p, pos, count, count2); 459 if (!EAT('}')) { /* error heuristics */ 460 while (MORE() && PEEK() != '}') 461 NEXT(); 462 (void)REQUIRE(MORE(), REG_EBRACE); 463 SETERROR(REG_BADBR); 464 } 465 break; 466 } 467 468 if (!MORE()) 469 return; 470 c = PEEK(); 471 if (!( c == '*' || c == '+' || c == '?' || 472 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 473 return; 474 SETERROR(REG_BADRPT); 475} 476 477/* 478 - p_str - string (no metacharacters) "parser" 479 == static void p_str(register struct parse *p); 480 */ 481static void 482p_str(p) 483register struct parse *p; 484{ 485 (void)REQUIRE(MORE(), REG_EMPTY); 486 while (MORE()) 487 ordinary(p, GETNEXT()); 488} 489 490/* 491 - p_bre - BRE parser top level, anchoring and concatenation 492 == static void p_bre(register struct parse *p, register int end1, \ 493 == register int end2); 494 * Giving end1 as OUT essentially eliminates the end1/end2 check. 495 * 496 * This implementation is a bit of a kludge, in that a trailing $ is first 497 * taken as an ordinary character and then revised to be an anchor. The 498 * only undesirable side effect is that '$' gets included as a character 499 * category in such cases. This is fairly harmless; not worth fixing. 500 * The amount of lookahead needed to avoid this kludge is excessive. 501 */ 502static void 503p_bre(p, end1, end2) 504register struct parse *p; 505register int end1; /* first terminating character */ 506register int end2; /* second terminating character */ 507{ 508 register sopno start = HERE(); 509 register int first = 1; /* first subexpression? */ 510 register int wasdollar = 0; 511 512 if (EAT('^')) { 513 EMIT(OBOL, 0); 514 p->g->iflags |= USEBOL; 515 p->g->nbol++; 516 } 517 while (MORE() && !SEETWO(end1, end2)) { 518 wasdollar = p_simp_re(p, first); 519 first = 0; 520 } 521 if (wasdollar) { /* oops, that was a trailing anchor */ 522 DROP(1); 523 EMIT(OEOL, 0); 524 p->g->iflags |= USEEOL; 525 p->g->neol++; 526 } 527 528 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 529} 530 531/* 532 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 533 == static int p_simp_re(register struct parse *p, int starordinary); 534 */ 535static int /* was the simple RE an unbackslashed $? */ 536p_simp_re(p, starordinary) 537register struct parse *p; 538int starordinary; /* is a leading * an ordinary character? */ 539{ 540 register int c; 541 register int count; 542 register int count2; 543 register sopno pos; 544 register int i; 545 register sopno subno; 546# define BACKSL (1<<CHAR_BIT) 547 548 pos = HERE(); /* repetion op, if any, covers from here */ 549 550 assert(MORE()); /* caller should have ensured this */ 551 c = GETNEXT(); 552 if (c == '\\') { 553 (void)REQUIRE(MORE(), REG_EESCAPE); 554 c = BACKSL | GETNEXT(); 555 } 556 switch (c) { 557 case '.': 558 if (p->g->cflags®_NEWLINE) 559 nonnewline(p); 560 else 561 EMIT(OANY, 0); 562 break; 563 case '[': 564 p_bracket(p); 565 break; 566 case BACKSL|'{': 567 SETERROR(REG_BADRPT); 568 break; 569 case BACKSL|'(': 570 p->g->nsub++; 571 subno = p->g->nsub; 572 if (subno < NPAREN) 573 p->pbegin[subno] = HERE(); 574 EMIT(OLPAREN, subno); 575 /* the MORE here is an error heuristic */ 576 if (MORE() && !SEETWO('\\', ')')) 577 p_bre(p, '\\', ')'); 578 if (subno < NPAREN) { 579 p->pend[subno] = HERE(); 580 assert(p->pend[subno] != 0); 581 } 582 EMIT(ORPAREN, subno); 583 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 584 break; 585 case BACKSL|')': /* should not get here -- must be user */ 586 case BACKSL|'}': 587 SETERROR(REG_EPAREN); 588 break; 589 case BACKSL|'1': 590 case BACKSL|'2': 591 case BACKSL|'3': 592 case BACKSL|'4': 593 case BACKSL|'5': 594 case BACKSL|'6': 595 case BACKSL|'7': 596 case BACKSL|'8': 597 case BACKSL|'9': 598 i = (c&~BACKSL) - '0'; 599 assert(i < NPAREN); 600 if (p->pend[i] != 0) { 601 assert(i <= p->g->nsub); 602 EMIT(OBACK_, i); 603 assert(p->pbegin[i] != 0); 604 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 605 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 606 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 607 EMIT(O_BACK, i); 608 } else 609 SETERROR(REG_ESUBREG); 610 p->g->backrefs = 1; 611 break; 612 case '*': 613 (void)REQUIRE(starordinary, REG_BADRPT); 614 /* FALLTHROUGH */ 615 default: 616 ordinary(p, (char)c); 617 break; 618 } 619 620 if (EAT('*')) { /* implemented as +? */ 621 /* this case does not require the (y|) trick, noKLUDGE */ 622 INSERT(OPLUS_, pos); 623 ASTERN(O_PLUS, pos); 624 INSERT(OQUEST_, pos); 625 ASTERN(O_QUEST, pos); 626 } else if (EATTWO('\\', '{')) { 627 count = p_count(p); 628 if (EAT(',')) { 629 if (MORE() && isdigit((uch)PEEK())) { 630 count2 = p_count(p); 631 (void)REQUIRE(count <= count2, REG_BADBR); 632 } else /* single number with comma */ 633 count2 = INFINITY; 634 } else /* just a single number */ 635 count2 = count; 636 repeat(p, pos, count, count2); 637 if (!EATTWO('\\', '}')) { /* error heuristics */ 638 while (MORE() && !SEETWO('\\', '}')) 639 NEXT(); 640 (void)REQUIRE(MORE(), REG_EBRACE); 641 SETERROR(REG_BADBR); 642 } 643 } else if (c == '$') /* $ (but not \$) ends it */ 644 return(1); 645 646 return(0); 647} 648 649/* 650 - p_count - parse a repetition count 651 == static int p_count(register struct parse *p); 652 */ 653static int /* the value */ 654p_count(p) 655register struct parse *p; 656{ 657 register int count = 0; 658 register int ndigits = 0; 659 660 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 661 count = count*10 + (GETNEXT() - '0'); 662 ndigits++; 663 } 664 665 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 666 return(count); 667} 668 669/* 670 - p_bracket - parse a bracketed character list 671 == static void p_bracket(register struct parse *p); 672 * 673 * Note a significant property of this code: if the allocset() did SETERROR, 674 * no set operations are done. 675 */ 676static void 677p_bracket(p) 678register struct parse *p; 679{ 680 register cset *cs = allocset(p); 681 register int invert = 0; 682 683 /* Dept of Truly Sickening Special-Case Kludges */ 684 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 685 EMIT(OBOW, 0); 686 NEXTn(6); 687 return; 688 } 689 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 690 EMIT(OEOW, 0); 691 NEXTn(6); 692 return; 693 } 694 695 if (EAT('^')) 696 invert++; /* make note to invert set at end */ 697 if (EAT(']')) 698 CHadd(cs, ']'); 699 else if (EAT('-')) 700 CHadd(cs, '-'); 701 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 702 p_b_term(p, cs); 703 if (EAT('-')) 704 CHadd(cs, '-'); 705 (void)MUSTEAT(']', REG_EBRACK); 706 707 if (p->error != 0) /* don't mess things up further */ 708 return; 709 710 if (p->g->cflags®_ICASE) { 711 register int i; 712 register int ci; 713 714 for (i = p->g->csetsize - 1; i >= 0; i--) 715 if (CHIN(cs, i) && isalpha(i)) { 716 ci = othercase(i); 717 if (ci != i) 718 CHadd(cs, ci); 719 } 720 if (cs->multis != NULL) 721 mccase(p, cs); 722 } 723 if (invert) { 724 register int i; 725 726 for (i = p->g->csetsize - 1; i >= 0; i--) 727 if (CHIN(cs, i)) 728 CHsub(cs, i); 729 else 730 CHadd(cs, i); 731 if (p->g->cflags®_NEWLINE) 732 CHsub(cs, '\n'); 733 if (cs->multis != NULL) 734 mcinvert(p, cs); 735 } 736 737 assert(cs->multis == NULL); /* xxx */ 738 739 if (nch(p, cs) == 1) { /* optimize singleton sets */ 740 ordinary(p, firstch(p, cs)); 741 freeset(p, cs); 742 } else 743 EMIT(OANYOF, freezeset(p, cs)); 744} 745 746/* 747 - p_b_term - parse one term of a bracketed character list 748 == static void p_b_term(register struct parse *p, register cset *cs); 749 */ 750static void 751p_b_term(p, cs) 752register struct parse *p; 753register cset *cs; 754{ 755 register char c; 756 register char start, finish; 757 register int i; 758 759 /* classify what we've got */ 760 switch ((MORE()) ? PEEK() : '\0') { 761 case '[': 762 c = (MORE2()) ? PEEK2() : '\0'; 763 break; 764 case '-': 765 SETERROR(REG_ERANGE); 766 return; /* NOTE RETURN */ 767 break; 768 default: 769 c = '\0'; 770 break; 771 } 772 773 switch (c) { 774 case ':': /* character class */ 775 NEXT2(); 776 (void)REQUIRE(MORE(), REG_EBRACK); 777 c = PEEK(); 778 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 779 p_b_cclass(p, cs); 780 (void)REQUIRE(MORE(), REG_EBRACK); 781 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 782 break; 783 case '=': /* equivalence class */ 784 NEXT2(); 785 (void)REQUIRE(MORE(), REG_EBRACK); 786 c = PEEK(); 787 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 788 p_b_eclass(p, cs); 789 (void)REQUIRE(MORE(), REG_EBRACK); 790 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 791 break; 792 default: /* symbol, ordinary character, or range */ 793/* xxx revision needed for multichar stuff */ 794 start = p_b_symbol(p); 795 if (SEE('-') && MORE2() && PEEK2() != ']') { 796 /* range */ 797 NEXT(); 798 if (EAT('-')) 799 finish = '-'; 800 else 801 finish = p_b_symbol(p); 802 } else 803 finish = start; 804 if (start == finish) 805 CHadd(cs, start); 806 else { 807 if (__collate_load_error) { 808 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 809 for (i = (uch)start; i <= (uch)finish; i++) 810 CHadd(cs, i); 811 } else { 812 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 813 for (i = CHAR_MIN; i <= CHAR_MAX; i++) { 814 if ( __collate_range_cmp(start, i) <= 0 815 && __collate_range_cmp(i, finish) <= 0 816 ) 817 CHadd(cs, i); 818 } 819 } 820 } 821 break; 822 } 823} 824 825/* 826 - p_b_cclass - parse a character-class name and deal with it 827 == static void p_b_cclass(register struct parse *p, register cset *cs); 828 */ 829static void 830p_b_cclass(p, cs) 831register struct parse *p; 832register cset *cs; 833{ 834 register int c; 835 register char *sp = p->next; 836 register struct cclass *cp; 837 register size_t len; 838 839 while (MORE() && isalpha((uch)PEEK())) 840 NEXT(); 841 len = p->next - sp; 842 for (cp = cclasses; cp->name != NULL; cp++) 843 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 844 break; 845 if (cp->name == NULL) { 846 /* oops, didn't find it */ 847 SETERROR(REG_ECTYPE); 848 return; 849 } 850 851 switch (cp->fidx) { 852 case CALNUM: 853 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 854 if (isalnum((uch)c)) 855 CHadd(cs, c); 856 break; 857 case CALPHA: 858 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 859 if (isalpha((uch)c)) 860 CHadd(cs, c); 861 break; 862 case CBLANK: 863 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 864 if (isblank((uch)c)) 865 CHadd(cs, c); 866 break; 867 case CCNTRL: 868 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 869 if (iscntrl((uch)c)) 870 CHadd(cs, c); 871 break; 872 case CDIGIT: 873 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 874 if (isdigit((uch)c)) 875 CHadd(cs, c); 876 break; 877 case CGRAPH: 878 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 879 if (isgraph((uch)c)) 880 CHadd(cs, c); 881 break; 882 case CLOWER: 883 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 884 if (islower((uch)c)) 885 CHadd(cs, c); 886 break; 887 case CPRINT: 888 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 889 if (isprint((uch)c)) 890 CHadd(cs, c); 891 break; 892 case CPUNCT: 893 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 894 if (ispunct((uch)c)) 895 CHadd(cs, c); 896 break; 897 case CSPACE: 898 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 899 if (isspace((uch)c)) 900 CHadd(cs, c); 901 break; 902 case CUPPER: 903 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 904 if (isupper((uch)c)) 905 CHadd(cs, c); 906 break; 907 case CXDIGIT: 908 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 909 if (isxdigit((uch)c)) 910 CHadd(cs, c); 911 break; 912 } 913#if 0 914 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 915 MCadd(p, cs, u); 916#endif 917} 918 919/* 920 - p_b_eclass - parse an equivalence-class name and deal with it 921 == static void p_b_eclass(register struct parse *p, register cset *cs); 922 * 923 * This implementation is incomplete. xxx 924 */ 925static void 926p_b_eclass(p, cs) 927register struct parse *p; 928register cset *cs; 929{ 930 register char c; 931 932 c = p_b_coll_elem(p, '='); 933 CHadd(cs, c); 934} 935 936/* 937 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 938 == static char p_b_symbol(register struct parse *p); 939 */ 940static char /* value of symbol */ 941p_b_symbol(p) 942register struct parse *p; 943{ 944 register char value; 945 946 (void)REQUIRE(MORE(), REG_EBRACK); 947 if (!EATTWO('[', '.')) 948 return(GETNEXT()); 949 950 /* collating symbol */ 951 value = p_b_coll_elem(p, '.'); 952 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 953 return(value); 954} 955 956/* 957 - p_b_coll_elem - parse a collating-element name and look it up 958 == static char p_b_coll_elem(register struct parse *p, int endc); 959 */ 960static char /* value of collating element */ 961p_b_coll_elem(p, endc) 962register struct parse *p; 963int endc; /* name ended by endc,']' */ 964{ 965 register char *sp = p->next; 966 register struct cname *cp; 967 register int len; 968 969 while (MORE() && !SEETWO(endc, ']')) 970 NEXT(); 971 if (!MORE()) { 972 SETERROR(REG_EBRACK); 973 return(0); 974 } 975 len = p->next - sp; 976 for (cp = cnames; cp->name != NULL; cp++) 977 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 978 return(cp->code); /* known name */ 979 if (len == 1) 980 return(*sp); /* single character */ 981 SETERROR(REG_ECOLLATE); /* neither */ 982 return(0); 983} 984 985/* 986 - othercase - return the case counterpart of an alphabetic 987 == static char othercase(int ch); 988 */ 989static char /* if no counterpart, return ch */ 990othercase(ch) 991int ch; 992{ 993 ch = (uch)ch; 994 assert(isalpha(ch)); 995 if (isupper(ch)) 996 return(tolower(ch)); 997 else if (islower(ch)) 998 return(toupper(ch)); 999 else /* peculiar, but could happen */ 1000 return(ch); 1001} 1002 1003/* 1004 - bothcases - emit a dualcase version of a two-case character 1005 == static void bothcases(register struct parse *p, int ch); 1006 * 1007 * Boy, is this implementation ever a kludge... 1008 */ 1009static void 1010bothcases(p, ch) 1011register struct parse *p; 1012int ch; 1013{ 1014 register char *oldnext = p->next; 1015 register char *oldend = p->end; 1016 char bracket[3]; 1017 1018 ch = (uch)ch; 1019 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1020 p->next = bracket; 1021 p->end = bracket+2; 1022 bracket[0] = ch; 1023 bracket[1] = ']'; 1024 bracket[2] = '\0'; 1025 p_bracket(p); 1026 assert(p->next == bracket+2); 1027 p->next = oldnext; 1028 p->end = oldend; 1029} 1030 1031/* 1032 - ordinary - emit an ordinary character 1033 == static void ordinary(register struct parse *p, register int ch); 1034 */ 1035static void 1036ordinary(p, ch) 1037register struct parse *p; 1038register int ch; 1039{ 1040 register cat_t *cap = p->g->categories; 1041 1042 if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 1043 bothcases(p, ch); 1044 else { 1045 EMIT(OCHAR, (uch)ch); 1046 if (cap[ch] == 0) 1047 cap[ch] = p->g->ncategories++; 1048 } 1049} 1050 1051/* 1052 - nonnewline - emit REG_NEWLINE version of OANY 1053 == static void nonnewline(register struct parse *p); 1054 * 1055 * Boy, is this implementation ever a kludge... 1056 */ 1057static void 1058nonnewline(p) 1059register struct parse *p; 1060{ 1061 register char *oldnext = p->next; 1062 register char *oldend = p->end; 1063 char bracket[4]; 1064 1065 p->next = bracket; 1066 p->end = bracket+3; 1067 bracket[0] = '^'; 1068 bracket[1] = '\n'; 1069 bracket[2] = ']'; 1070 bracket[3] = '\0'; 1071 p_bracket(p); 1072 assert(p->next == bracket+3); 1073 p->next = oldnext; 1074 p->end = oldend; 1075} 1076 1077/* 1078 - repeat - generate code for a bounded repetition, recursively if needed 1079 == static void repeat(register struct parse *p, sopno start, int from, int to); 1080 */ 1081static void 1082repeat(p, start, from, to) 1083register struct parse *p; 1084sopno start; /* operand from here to end of strip */ 1085int from; /* repeated from this number */ 1086int to; /* to this number of times (maybe INFINITY) */ 1087{ 1088 register sopno finish = HERE(); 1089# define N 2 1090# define INF 3 1091# define REP(f, t) ((f)*8 + (t)) 1092# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1093 register sopno copy; 1094 1095 if (p->error != 0) /* head off possible runaway recursion */ 1096 return; 1097 1098 assert(from <= to); 1099 1100 switch (REP(MAP(from), MAP(to))) { 1101 case REP(0, 0): /* must be user doing this */ 1102 DROP(finish-start); /* drop the operand */ 1103 break; 1104 case REP(0, 1): /* as x{1,1}? */ 1105 case REP(0, N): /* as x{1,n}? */ 1106 case REP(0, INF): /* as x{1,}? */ 1107 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1108 INSERT(OCH_, start); /* offset is wrong... */ 1109 repeat(p, start+1, 1, to); 1110 ASTERN(OOR1, start); 1111 AHEAD(start); /* ... fix it */ 1112 EMIT(OOR2, 0); 1113 AHEAD(THERE()); 1114 ASTERN(O_CH, THERETHERE()); 1115 break; 1116 case REP(1, 1): /* trivial case */ 1117 /* done */ 1118 break; 1119 case REP(1, N): /* as x?x{1,n-1} */ 1120 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1121 INSERT(OCH_, start); 1122 ASTERN(OOR1, start); 1123 AHEAD(start); 1124 EMIT(OOR2, 0); /* offset very wrong... */ 1125 AHEAD(THERE()); /* ...so fix it */ 1126 ASTERN(O_CH, THERETHERE()); 1127 copy = dupl(p, start+1, finish+1); 1128 assert(copy == finish+4); 1129 repeat(p, copy, 1, to-1); 1130 break; 1131 case REP(1, INF): /* as x+ */ 1132 INSERT(OPLUS_, start); 1133 ASTERN(O_PLUS, start); 1134 break; 1135 case REP(N, N): /* as xx{m-1,n-1} */ 1136 copy = dupl(p, start, finish); 1137 repeat(p, copy, from-1, to-1); 1138 break; 1139 case REP(N, INF): /* as xx{n-1,INF} */ 1140 copy = dupl(p, start, finish); 1141 repeat(p, copy, from-1, to); 1142 break; 1143 default: /* "can't happen" */ 1144 SETERROR(REG_ASSERT); /* just in case */ 1145 break; 1146 } 1147} 1148 1149/* 1150 - seterr - set an error condition 1151 == static int seterr(register struct parse *p, int e); 1152 */ 1153static int /* useless but makes type checking happy */ 1154seterr(p, e) 1155register struct parse *p; 1156int e; 1157{ 1158 if (p->error == 0) /* keep earliest error condition */ 1159 p->error = e; 1160 p->next = nuls; /* try to bring things to a halt */ 1161 p->end = nuls; 1162 return(0); /* make the return value well-defined */ 1163} 1164 1165/* 1166 - allocset - allocate a set of characters for [] 1167 == static cset *allocset(register struct parse *p); 1168 */ 1169static cset * 1170allocset(p) 1171register struct parse *p; 1172{ 1173 register int no = p->g->ncsets++; 1174 register size_t nc; 1175 register size_t nbytes; 1176 register cset *cs; 1177 register size_t css = (size_t)p->g->csetsize; 1178 register int i; 1179 1180 if (no >= p->ncsalloc) { /* need another column of space */ 1181 p->ncsalloc += CHAR_BIT; 1182 nc = p->ncsalloc; 1183 assert(nc % CHAR_BIT == 0); 1184 nbytes = nc / CHAR_BIT * css; 1185 if (p->g->sets == NULL) 1186 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1187 else 1188 p->g->sets = (cset *)reallocf((char *)p->g->sets, 1189 nc * sizeof(cset)); 1190 if (p->g->setbits == NULL) 1191 p->g->setbits = (uch *)malloc(nbytes); 1192 else { 1193 p->g->setbits = (uch *)reallocf((char *)p->g->setbits, 1194 nbytes); 1195 /* xxx this isn't right if setbits is now NULL */ 1196 for (i = 0; i < no; i++) 1197 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1198 } 1199 if (p->g->sets != NULL && p->g->setbits != NULL) 1200 (void) memset((char *)p->g->setbits + (nbytes - css), 1201 0, css); 1202 else { 1203 no = 0; 1204 SETERROR(REG_ESPACE); 1205 /* caller's responsibility not to do set ops */ 1206 } 1207 } 1208 1209 assert(p->g->sets != NULL); /* xxx */ 1210 cs = &p->g->sets[no]; 1211 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1212 cs->mask = 1 << ((no) % CHAR_BIT); 1213 cs->hash = 0; 1214 cs->smultis = 0; 1215 cs->multis = NULL; 1216 1217 return(cs); 1218} 1219 1220/* 1221 - freeset - free a now-unused set 1222 == static void freeset(register struct parse *p, register cset *cs); 1223 */ 1224static void 1225freeset(p, cs) 1226register struct parse *p; 1227register cset *cs; 1228{ 1229 register int i; 1230 register cset *top = &p->g->sets[p->g->ncsets]; 1231 register size_t css = (size_t)p->g->csetsize; 1232 1233 for (i = 0; i < css; i++) 1234 CHsub(cs, i); 1235 if (cs == top-1) /* recover only the easy case */ 1236 p->g->ncsets--; 1237} 1238 1239/* 1240 - freezeset - final processing on a set of characters 1241 == static int freezeset(register struct parse *p, register cset *cs); 1242 * 1243 * The main task here is merging identical sets. This is usually a waste 1244 * of time (although the hash code minimizes the overhead), but can win 1245 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1246 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1247 * the same value! 1248 */ 1249static int /* set number */ 1250freezeset(p, cs) 1251register struct parse *p; 1252register cset *cs; 1253{ 1254 register short h = cs->hash; 1255 register int i; 1256 register cset *top = &p->g->sets[p->g->ncsets]; 1257 register cset *cs2; 1258 register size_t css = (size_t)p->g->csetsize; 1259 1260 /* look for an earlier one which is the same */ 1261 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1262 if (cs2->hash == h && cs2 != cs) { 1263 /* maybe */ 1264 for (i = 0; i < css; i++) 1265 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1266 break; /* no */ 1267 if (i == css) 1268 break; /* yes */ 1269 } 1270 1271 if (cs2 < top) { /* found one */ 1272 freeset(p, cs); 1273 cs = cs2; 1274 } 1275 1276 return((int)(cs - p->g->sets)); 1277} 1278 1279/* 1280 - firstch - return first character in a set (which must have at least one) 1281 == static int firstch(register struct parse *p, register cset *cs); 1282 */ 1283static int /* character; there is no "none" value */ 1284firstch(p, cs) 1285register struct parse *p; 1286register cset *cs; 1287{ 1288 register int i; 1289 register size_t css = (size_t)p->g->csetsize; 1290 1291 for (i = 0; i < css; i++) 1292 if (CHIN(cs, i)) 1293 return((char)i); 1294 assert(never); 1295 return(0); /* arbitrary */ 1296} 1297 1298/* 1299 - nch - number of characters in a set 1300 == static int nch(register struct parse *p, register cset *cs); 1301 */ 1302static int 1303nch(p, cs) 1304register struct parse *p; 1305register cset *cs; 1306{ 1307 register int i; 1308 register size_t css = (size_t)p->g->csetsize; 1309 register int n = 0; 1310 1311 for (i = 0; i < css; i++) 1312 if (CHIN(cs, i)) 1313 n++; 1314 return(n); 1315} 1316 1317/* 1318 - mcadd - add a collating element to a cset 1319 == static void mcadd(register struct parse *p, register cset *cs, \ 1320 == register char *cp); 1321 */ 1322static void 1323mcadd(p, cs, cp) 1324register struct parse *p; 1325register cset *cs; 1326register char *cp; 1327{ 1328 register size_t oldend = cs->smultis; 1329 1330 cs->smultis += strlen(cp) + 1; 1331 if (cs->multis == NULL) 1332 cs->multis = malloc(cs->smultis); 1333 else 1334 cs->multis = reallocf(cs->multis, cs->smultis); 1335 if (cs->multis == NULL) { 1336 SETERROR(REG_ESPACE); 1337 return; 1338 } 1339 1340 (void) strcpy(cs->multis + oldend - 1, cp); 1341 cs->multis[cs->smultis - 1] = '\0'; 1342} 1343 1344#if used 1345/* 1346 - mcsub - subtract a collating element from a cset 1347 == static void mcsub(register cset *cs, register char *cp); 1348 */ 1349static void 1350mcsub(cs, cp) 1351register cset *cs; 1352register char *cp; 1353{ 1354 register char *fp = mcfind(cs, cp); 1355 register size_t len = strlen(fp); 1356 1357 assert(fp != NULL); 1358 (void) memmove(fp, fp + len + 1, 1359 cs->smultis - (fp + len + 1 - cs->multis)); 1360 cs->smultis -= len; 1361 1362 if (cs->smultis == 0) { 1363 free(cs->multis); 1364 cs->multis = NULL; 1365 return; 1366 } 1367 1368 cs->multis = reallocf(cs->multis, cs->smultis); 1369 assert(cs->multis != NULL); 1370} 1371 1372/* 1373 - mcin - is a collating element in a cset? 1374 == static int mcin(register cset *cs, register char *cp); 1375 */ 1376static int 1377mcin(cs, cp) 1378register cset *cs; 1379register char *cp; 1380{ 1381 return(mcfind(cs, cp) != NULL); 1382} 1383 1384/* 1385 - mcfind - find a collating element in a cset 1386 == static char *mcfind(register cset *cs, register char *cp); 1387 */ 1388static char * 1389mcfind(cs, cp) 1390register cset *cs; 1391register char *cp; 1392{ 1393 register char *p; 1394 1395 if (cs->multis == NULL) 1396 return(NULL); 1397 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1398 if (strcmp(cp, p) == 0) 1399 return(p); 1400 return(NULL); 1401} 1402#endif 1403 1404/* 1405 - mcinvert - invert the list of collating elements in a cset 1406 == static void mcinvert(register struct parse *p, register cset *cs); 1407 * 1408 * This would have to know the set of possibilities. Implementation 1409 * is deferred. 1410 */ 1411static void 1412mcinvert(p, cs) 1413register struct parse *p; 1414register cset *cs; 1415{ 1416 assert(cs->multis == NULL); /* xxx */ 1417} 1418 1419/* 1420 - mccase - add case counterparts of the list of collating elements in a cset 1421 == static void mccase(register struct parse *p, register cset *cs); 1422 * 1423 * This would have to know the set of possibilities. Implementation 1424 * is deferred. 1425 */ 1426static void 1427mccase(p, cs) 1428register struct parse *p; 1429register cset *cs; 1430{ 1431 assert(cs->multis == NULL); /* xxx */ 1432} 1433 1434/* 1435 - isinsets - is this character in any sets? 1436 == static int isinsets(register struct re_guts *g, int c); 1437 */ 1438static int /* predicate */ 1439isinsets(g, c) 1440register struct re_guts *g; 1441int c; 1442{ 1443 register uch *col; 1444 register int i; 1445 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1446 register unsigned uc = (uch)c; 1447 1448 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1449 if (col[uc] != 0) 1450 return(1); 1451 return(0); 1452} 1453 1454/* 1455 - samesets - are these two characters in exactly the same sets? 1456 == static int samesets(register struct re_guts *g, int c1, int c2); 1457 */ 1458static int /* predicate */ 1459samesets(g, c1, c2) 1460register struct re_guts *g; 1461int c1; 1462int c2; 1463{ 1464 register uch *col; 1465 register int i; 1466 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1467 register unsigned uc1 = (uch)c1; 1468 register unsigned uc2 = (uch)c2; 1469 1470 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1471 if (col[uc1] != col[uc2]) 1472 return(0); 1473 return(1); 1474} 1475 1476/* 1477 - categorize - sort out character categories 1478 == static void categorize(struct parse *p, register struct re_guts *g); 1479 */ 1480static void 1481categorize(p, g) 1482struct parse *p; 1483register struct re_guts *g; 1484{ 1485 register cat_t *cats = g->categories; 1486 register int c; 1487 register int c2; 1488 register cat_t cat; 1489 1490 /* avoid making error situations worse */ 1491 if (p->error != 0) 1492 return; 1493 1494 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1495 if (cats[c] == 0 && isinsets(g, c)) { 1496 cat = g->ncategories++; 1497 cats[c] = cat; 1498 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1499 if (cats[c2] == 0 && samesets(g, c, c2)) 1500 cats[c2] = cat; 1501 } 1502} 1503 1504/* 1505 - dupl - emit a duplicate of a bunch of sops 1506 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1507 */ 1508static sopno /* start of duplicate */ 1509dupl(p, start, finish) 1510register struct parse *p; 1511sopno start; /* from here */ 1512sopno finish; /* to this less one */ 1513{ 1514 register sopno ret = HERE(); 1515 register sopno len = finish - start; 1516 1517 assert(finish >= start); 1518 if (len == 0) 1519 return(ret); 1520 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1521 assert(p->ssize >= p->slen + len); 1522 (void) memcpy((char *)(p->strip + p->slen), 1523 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1524 p->slen += len; 1525 return(ret); 1526} 1527 1528/* 1529 - doemit - emit a strip operator 1530 == static void doemit(register struct parse *p, sop op, size_t opnd); 1531 * 1532 * It might seem better to implement this as a macro with a function as 1533 * hard-case backup, but it's just too big and messy unless there are 1534 * some changes to the data structures. Maybe later. 1535 */ 1536static void 1537doemit(p, op, opnd) 1538register struct parse *p; 1539sop op; 1540size_t opnd; 1541{ 1542 /* avoid making error situations worse */ 1543 if (p->error != 0) 1544 return; 1545 1546 /* deal with oversize operands ("can't happen", more or less) */ 1547 assert(opnd < 1<<OPSHIFT); 1548 1549 /* deal with undersized strip */ 1550 if (p->slen >= p->ssize) 1551 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1552 assert(p->slen < p->ssize); 1553 1554 /* finally, it's all reduced to the easy case */ 1555 p->strip[p->slen++] = SOP(op, opnd); 1556} 1557 1558/* 1559 - doinsert - insert a sop into the strip 1560 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1561 */ 1562static void 1563doinsert(p, op, opnd, pos) 1564register struct parse *p; 1565sop op; 1566size_t opnd; 1567sopno pos; 1568{ 1569 register sopno sn; 1570 register sop s; 1571 register int i; 1572 1573 /* avoid making error situations worse */ 1574 if (p->error != 0) 1575 return; 1576 1577 sn = HERE(); 1578 EMIT(op, opnd); /* do checks, ensure space */ 1579 assert(HERE() == sn+1); 1580 s = p->strip[sn]; 1581 1582 /* adjust paren pointers */ 1583 assert(pos > 0); 1584 for (i = 1; i < NPAREN; i++) { 1585 if (p->pbegin[i] >= pos) { 1586 p->pbegin[i]++; 1587 } 1588 if (p->pend[i] >= pos) { 1589 p->pend[i]++; 1590 } 1591 } 1592 1593 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1594 (HERE()-pos-1)*sizeof(sop)); 1595 p->strip[pos] = s; 1596} 1597 1598/* 1599 - dofwd - complete a forward reference 1600 == static void dofwd(register struct parse *p, sopno pos, sop value); 1601 */ 1602static void 1603dofwd(p, pos, value) 1604register struct parse *p; 1605register sopno pos; 1606sop value; 1607{ 1608 /* avoid making error situations worse */ 1609 if (p->error != 0) 1610 return; 1611 1612 assert(value < 1<<OPSHIFT); 1613 p->strip[pos] = OP(p->strip[pos]) | value; 1614} 1615 1616/* 1617 - enlarge - enlarge the strip 1618 == static void enlarge(register struct parse *p, sopno size); 1619 */ 1620static void 1621enlarge(p, size) 1622register struct parse *p; 1623register sopno size; 1624{ 1625 register sop *sp; 1626 1627 if (p->ssize >= size) 1628 return; 1629 1630 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1631 if (sp == NULL) { 1632 SETERROR(REG_ESPACE); 1633 return; 1634 } 1635 p->strip = sp; 1636 p->ssize = size; 1637} 1638 1639/* 1640 - stripsnug - compact the strip 1641 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1642 */ 1643static void 1644stripsnug(p, g) 1645register struct parse *p; 1646register struct re_guts *g; 1647{ 1648 g->nstates = p->slen; 1649 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1650 if (g->strip == NULL) { 1651 SETERROR(REG_ESPACE); 1652 g->strip = p->strip; 1653 } 1654} 1655 1656/* 1657 - findmust - fill in must and mlen with longest mandatory literal string 1658 == static void findmust(register struct parse *p, register struct re_guts *g); 1659 * 1660 * This algorithm could do fancy things like analyzing the operands of | 1661 * for common subsequences. Someday. This code is simple and finds most 1662 * of the interesting cases. 1663 * 1664 * Note that must and mlen got initialized during setup. 1665 */ 1666static void 1667findmust(p, g) 1668struct parse *p; 1669register struct re_guts *g; 1670{ 1671 register sop *scan; 1672 sop *start; 1673 register sop *newstart; 1674 register sopno newlen; 1675 register sop s; 1676 register char *cp; 1677 register sopno i; 1678 1679 /* avoid making error situations worse */ 1680 if (p->error != 0) 1681 return; 1682 1683 /* find the longest OCHAR sequence in strip */ 1684 newlen = 0; 1685 scan = g->strip + 1; 1686 do { 1687 s = *scan++; 1688 switch (OP(s)) { 1689 case OCHAR: /* sequence member */ 1690 if (newlen == 0) /* new sequence */ 1691 newstart = scan - 1; 1692 newlen++; 1693 break; 1694 case OPLUS_: /* things that don't break one */ 1695 case OLPAREN: 1696 case ORPAREN: 1697 break; 1698 case OQUEST_: /* things that must be skipped */ 1699 case OCH_: 1700 scan--; 1701 do { 1702 scan += OPND(s); 1703 s = *scan; 1704 /* assert() interferes w debug printouts */ 1705 if (OP(s) != O_QUEST && OP(s) != O_CH && 1706 OP(s) != OOR2) { 1707 g->iflags |= BAD; 1708 return; 1709 } 1710 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1711 /* fallthrough */ 1712 default: /* things that break a sequence */ 1713 if (newlen > g->mlen) { /* ends one */ 1714 start = newstart; 1715 g->mlen = newlen; 1716 } 1717 newlen = 0; 1718 break; 1719 } 1720 } while (OP(s) != OEND); 1721 1722 if (g->mlen == 0) /* there isn't one */ 1723 return; 1724 1725 /* turn it into a character string */ 1726 g->must = malloc((size_t)g->mlen + 1); 1727 if (g->must == NULL) { /* argh; just forget it */ 1728 g->mlen = 0; 1729 return; 1730 } 1731 cp = g->must; 1732 scan = start; 1733 for (i = g->mlen; i > 0; i--) { 1734 while (OP(s = *scan++) != OCHAR) 1735 continue; 1736 assert(cp < g->must + g->mlen); 1737 *cp++ = (char)OPND(s); 1738 } 1739 assert(cp == g->must + g->mlen); 1740 *cp++ = '\0'; /* just on general principles */ 1741} 1742 1743/*
| 283 g->nplus = pluscount(p, g); 284 g->magic = MAGIC2; 285 preg->re_nsub = g->nsub; 286 preg->re_g = g; 287 preg->re_magic = MAGIC1; 288#ifndef REDEBUG 289 /* not debugging, so can't rely on the assert() in regexec() */ 290 if (g->iflags&BAD) 291 SETERROR(REG_ASSERT); 292#endif 293 294 /* win or lose, we're done */ 295 if (p->error != 0) /* lose */ 296 regfree(preg); 297 return(p->error); 298} 299 300/* 301 - p_ere - ERE parser top level, concatenation and alternation 302 == static void p_ere(register struct parse *p, int stop); 303 */ 304static void 305p_ere(p, stop) 306register struct parse *p; 307int stop; /* character this ERE should end at */ 308{ 309 register char c; 310 register sopno prevback; 311 register sopno prevfwd; 312 register sopno conc; 313 register int first = 1; /* is this the first alternative? */ 314 315 for (;;) { 316 /* do a bunch of concatenated expressions */ 317 conc = HERE(); 318 while (MORE() && (c = PEEK()) != '|' && c != stop) 319 p_ere_exp(p); 320 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 321 322 if (!EAT('|')) 323 break; /* NOTE BREAK OUT */ 324 325 if (first) { 326 INSERT(OCH_, conc); /* offset is wrong */ 327 prevfwd = conc; 328 prevback = conc; 329 first = 0; 330 } 331 ASTERN(OOR1, prevback); 332 prevback = THERE(); 333 AHEAD(prevfwd); /* fix previous offset */ 334 prevfwd = HERE(); 335 EMIT(OOR2, 0); /* offset is very wrong */ 336 } 337 338 if (!first) { /* tail-end fixups */ 339 AHEAD(prevfwd); 340 ASTERN(O_CH, prevback); 341 } 342 343 assert(!MORE() || SEE(stop)); 344} 345 346/* 347 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 348 == static void p_ere_exp(register struct parse *p); 349 */ 350static void 351p_ere_exp(p) 352register struct parse *p; 353{ 354 register char c; 355 register sopno pos; 356 register int count; 357 register int count2; 358 register sopno subno; 359 int wascaret = 0; 360 361 assert(MORE()); /* caller should have ensured this */ 362 c = GETNEXT(); 363 364 pos = HERE(); 365 switch (c) { 366 case '(': 367 (void)REQUIRE(MORE(), REG_EPAREN); 368 p->g->nsub++; 369 subno = p->g->nsub; 370 if (subno < NPAREN) 371 p->pbegin[subno] = HERE(); 372 EMIT(OLPAREN, subno); 373 if (!SEE(')')) 374 p_ere(p, ')'); 375 if (subno < NPAREN) { 376 p->pend[subno] = HERE(); 377 assert(p->pend[subno] != 0); 378 } 379 EMIT(ORPAREN, subno); 380 (void)MUSTEAT(')', REG_EPAREN); 381 break; 382#ifndef POSIX_MISTAKE 383 case ')': /* happens only if no current unmatched ( */ 384 /* 385 * You may ask, why the ifndef? Because I didn't notice 386 * this until slightly too late for 1003.2, and none of the 387 * other 1003.2 regular-expression reviewers noticed it at 388 * all. So an unmatched ) is legal POSIX, at least until 389 * we can get it fixed. 390 */ 391 SETERROR(REG_EPAREN); 392 break; 393#endif 394 case '^': 395 EMIT(OBOL, 0); 396 p->g->iflags |= USEBOL; 397 p->g->nbol++; 398 wascaret = 1; 399 break; 400 case '$': 401 EMIT(OEOL, 0); 402 p->g->iflags |= USEEOL; 403 p->g->neol++; 404 break; 405 case '|': 406 SETERROR(REG_EMPTY); 407 break; 408 case '*': 409 case '+': 410 case '?': 411 SETERROR(REG_BADRPT); 412 break; 413 case '.': 414 if (p->g->cflags®_NEWLINE) 415 nonnewline(p); 416 else 417 EMIT(OANY, 0); 418 break; 419 case '[': 420 p_bracket(p); 421 break; 422 case '\\': 423 (void)REQUIRE(MORE(), REG_EESCAPE); 424 c = GETNEXT(); 425 ordinary(p, c); 426 break; 427 case '{': /* okay as ordinary except if digit follows */ 428 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 429 /* FALLTHROUGH */ 430 default: 431 ordinary(p, c); 432 break; 433 } 434 435 if (!MORE()) 436 return; 437 c = PEEK(); 438 /* we call { a repetition if followed by a digit */ 439 if (!( c == '*' || c == '+' || c == '?' || 440 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 441 return; /* no repetition, we're done */ 442 NEXT(); 443 444 (void)REQUIRE(!wascaret, REG_BADRPT); 445 switch (c) { 446 case '*': /* implemented as +? */ 447 /* this case does not require the (y|) trick, noKLUDGE */ 448 INSERT(OPLUS_, pos); 449 ASTERN(O_PLUS, pos); 450 INSERT(OQUEST_, pos); 451 ASTERN(O_QUEST, pos); 452 break; 453 case '+': 454 INSERT(OPLUS_, pos); 455 ASTERN(O_PLUS, pos); 456 break; 457 case '?': 458 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 459 INSERT(OCH_, pos); /* offset slightly wrong */ 460 ASTERN(OOR1, pos); /* this one's right */ 461 AHEAD(pos); /* fix the OCH_ */ 462 EMIT(OOR2, 0); /* offset very wrong... */ 463 AHEAD(THERE()); /* ...so fix it */ 464 ASTERN(O_CH, THERETHERE()); 465 break; 466 case '{': 467 count = p_count(p); 468 if (EAT(',')) { 469 if (isdigit((uch)PEEK())) { 470 count2 = p_count(p); 471 (void)REQUIRE(count <= count2, REG_BADBR); 472 } else /* single number with comma */ 473 count2 = INFINITY; 474 } else /* just a single number */ 475 count2 = count; 476 repeat(p, pos, count, count2); 477 if (!EAT('}')) { /* error heuristics */ 478 while (MORE() && PEEK() != '}') 479 NEXT(); 480 (void)REQUIRE(MORE(), REG_EBRACE); 481 SETERROR(REG_BADBR); 482 } 483 break; 484 } 485 486 if (!MORE()) 487 return; 488 c = PEEK(); 489 if (!( c == '*' || c == '+' || c == '?' || 490 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 491 return; 492 SETERROR(REG_BADRPT); 493} 494 495/* 496 - p_str - string (no metacharacters) "parser" 497 == static void p_str(register struct parse *p); 498 */ 499static void 500p_str(p) 501register struct parse *p; 502{ 503 (void)REQUIRE(MORE(), REG_EMPTY); 504 while (MORE()) 505 ordinary(p, GETNEXT()); 506} 507 508/* 509 - p_bre - BRE parser top level, anchoring and concatenation 510 == static void p_bre(register struct parse *p, register int end1, \ 511 == register int end2); 512 * Giving end1 as OUT essentially eliminates the end1/end2 check. 513 * 514 * This implementation is a bit of a kludge, in that a trailing $ is first 515 * taken as an ordinary character and then revised to be an anchor. The 516 * only undesirable side effect is that '$' gets included as a character 517 * category in such cases. This is fairly harmless; not worth fixing. 518 * The amount of lookahead needed to avoid this kludge is excessive. 519 */ 520static void 521p_bre(p, end1, end2) 522register struct parse *p; 523register int end1; /* first terminating character */ 524register int end2; /* second terminating character */ 525{ 526 register sopno start = HERE(); 527 register int first = 1; /* first subexpression? */ 528 register int wasdollar = 0; 529 530 if (EAT('^')) { 531 EMIT(OBOL, 0); 532 p->g->iflags |= USEBOL; 533 p->g->nbol++; 534 } 535 while (MORE() && !SEETWO(end1, end2)) { 536 wasdollar = p_simp_re(p, first); 537 first = 0; 538 } 539 if (wasdollar) { /* oops, that was a trailing anchor */ 540 DROP(1); 541 EMIT(OEOL, 0); 542 p->g->iflags |= USEEOL; 543 p->g->neol++; 544 } 545 546 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 547} 548 549/* 550 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 551 == static int p_simp_re(register struct parse *p, int starordinary); 552 */ 553static int /* was the simple RE an unbackslashed $? */ 554p_simp_re(p, starordinary) 555register struct parse *p; 556int starordinary; /* is a leading * an ordinary character? */ 557{ 558 register int c; 559 register int count; 560 register int count2; 561 register sopno pos; 562 register int i; 563 register sopno subno; 564# define BACKSL (1<<CHAR_BIT) 565 566 pos = HERE(); /* repetion op, if any, covers from here */ 567 568 assert(MORE()); /* caller should have ensured this */ 569 c = GETNEXT(); 570 if (c == '\\') { 571 (void)REQUIRE(MORE(), REG_EESCAPE); 572 c = BACKSL | GETNEXT(); 573 } 574 switch (c) { 575 case '.': 576 if (p->g->cflags®_NEWLINE) 577 nonnewline(p); 578 else 579 EMIT(OANY, 0); 580 break; 581 case '[': 582 p_bracket(p); 583 break; 584 case BACKSL|'{': 585 SETERROR(REG_BADRPT); 586 break; 587 case BACKSL|'(': 588 p->g->nsub++; 589 subno = p->g->nsub; 590 if (subno < NPAREN) 591 p->pbegin[subno] = HERE(); 592 EMIT(OLPAREN, subno); 593 /* the MORE here is an error heuristic */ 594 if (MORE() && !SEETWO('\\', ')')) 595 p_bre(p, '\\', ')'); 596 if (subno < NPAREN) { 597 p->pend[subno] = HERE(); 598 assert(p->pend[subno] != 0); 599 } 600 EMIT(ORPAREN, subno); 601 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 602 break; 603 case BACKSL|')': /* should not get here -- must be user */ 604 case BACKSL|'}': 605 SETERROR(REG_EPAREN); 606 break; 607 case BACKSL|'1': 608 case BACKSL|'2': 609 case BACKSL|'3': 610 case BACKSL|'4': 611 case BACKSL|'5': 612 case BACKSL|'6': 613 case BACKSL|'7': 614 case BACKSL|'8': 615 case BACKSL|'9': 616 i = (c&~BACKSL) - '0'; 617 assert(i < NPAREN); 618 if (p->pend[i] != 0) { 619 assert(i <= p->g->nsub); 620 EMIT(OBACK_, i); 621 assert(p->pbegin[i] != 0); 622 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 623 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 624 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 625 EMIT(O_BACK, i); 626 } else 627 SETERROR(REG_ESUBREG); 628 p->g->backrefs = 1; 629 break; 630 case '*': 631 (void)REQUIRE(starordinary, REG_BADRPT); 632 /* FALLTHROUGH */ 633 default: 634 ordinary(p, (char)c); 635 break; 636 } 637 638 if (EAT('*')) { /* implemented as +? */ 639 /* this case does not require the (y|) trick, noKLUDGE */ 640 INSERT(OPLUS_, pos); 641 ASTERN(O_PLUS, pos); 642 INSERT(OQUEST_, pos); 643 ASTERN(O_QUEST, pos); 644 } else if (EATTWO('\\', '{')) { 645 count = p_count(p); 646 if (EAT(',')) { 647 if (MORE() && isdigit((uch)PEEK())) { 648 count2 = p_count(p); 649 (void)REQUIRE(count <= count2, REG_BADBR); 650 } else /* single number with comma */ 651 count2 = INFINITY; 652 } else /* just a single number */ 653 count2 = count; 654 repeat(p, pos, count, count2); 655 if (!EATTWO('\\', '}')) { /* error heuristics */ 656 while (MORE() && !SEETWO('\\', '}')) 657 NEXT(); 658 (void)REQUIRE(MORE(), REG_EBRACE); 659 SETERROR(REG_BADBR); 660 } 661 } else if (c == '$') /* $ (but not \$) ends it */ 662 return(1); 663 664 return(0); 665} 666 667/* 668 - p_count - parse a repetition count 669 == static int p_count(register struct parse *p); 670 */ 671static int /* the value */ 672p_count(p) 673register struct parse *p; 674{ 675 register int count = 0; 676 register int ndigits = 0; 677 678 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 679 count = count*10 + (GETNEXT() - '0'); 680 ndigits++; 681 } 682 683 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 684 return(count); 685} 686 687/* 688 - p_bracket - parse a bracketed character list 689 == static void p_bracket(register struct parse *p); 690 * 691 * Note a significant property of this code: if the allocset() did SETERROR, 692 * no set operations are done. 693 */ 694static void 695p_bracket(p) 696register struct parse *p; 697{ 698 register cset *cs = allocset(p); 699 register int invert = 0; 700 701 /* Dept of Truly Sickening Special-Case Kludges */ 702 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 703 EMIT(OBOW, 0); 704 NEXTn(6); 705 return; 706 } 707 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 708 EMIT(OEOW, 0); 709 NEXTn(6); 710 return; 711 } 712 713 if (EAT('^')) 714 invert++; /* make note to invert set at end */ 715 if (EAT(']')) 716 CHadd(cs, ']'); 717 else if (EAT('-')) 718 CHadd(cs, '-'); 719 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 720 p_b_term(p, cs); 721 if (EAT('-')) 722 CHadd(cs, '-'); 723 (void)MUSTEAT(']', REG_EBRACK); 724 725 if (p->error != 0) /* don't mess things up further */ 726 return; 727 728 if (p->g->cflags®_ICASE) { 729 register int i; 730 register int ci; 731 732 for (i = p->g->csetsize - 1; i >= 0; i--) 733 if (CHIN(cs, i) && isalpha(i)) { 734 ci = othercase(i); 735 if (ci != i) 736 CHadd(cs, ci); 737 } 738 if (cs->multis != NULL) 739 mccase(p, cs); 740 } 741 if (invert) { 742 register int i; 743 744 for (i = p->g->csetsize - 1; i >= 0; i--) 745 if (CHIN(cs, i)) 746 CHsub(cs, i); 747 else 748 CHadd(cs, i); 749 if (p->g->cflags®_NEWLINE) 750 CHsub(cs, '\n'); 751 if (cs->multis != NULL) 752 mcinvert(p, cs); 753 } 754 755 assert(cs->multis == NULL); /* xxx */ 756 757 if (nch(p, cs) == 1) { /* optimize singleton sets */ 758 ordinary(p, firstch(p, cs)); 759 freeset(p, cs); 760 } else 761 EMIT(OANYOF, freezeset(p, cs)); 762} 763 764/* 765 - p_b_term - parse one term of a bracketed character list 766 == static void p_b_term(register struct parse *p, register cset *cs); 767 */ 768static void 769p_b_term(p, cs) 770register struct parse *p; 771register cset *cs; 772{ 773 register char c; 774 register char start, finish; 775 register int i; 776 777 /* classify what we've got */ 778 switch ((MORE()) ? PEEK() : '\0') { 779 case '[': 780 c = (MORE2()) ? PEEK2() : '\0'; 781 break; 782 case '-': 783 SETERROR(REG_ERANGE); 784 return; /* NOTE RETURN */ 785 break; 786 default: 787 c = '\0'; 788 break; 789 } 790 791 switch (c) { 792 case ':': /* character class */ 793 NEXT2(); 794 (void)REQUIRE(MORE(), REG_EBRACK); 795 c = PEEK(); 796 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 797 p_b_cclass(p, cs); 798 (void)REQUIRE(MORE(), REG_EBRACK); 799 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 800 break; 801 case '=': /* equivalence class */ 802 NEXT2(); 803 (void)REQUIRE(MORE(), REG_EBRACK); 804 c = PEEK(); 805 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 806 p_b_eclass(p, cs); 807 (void)REQUIRE(MORE(), REG_EBRACK); 808 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 809 break; 810 default: /* symbol, ordinary character, or range */ 811/* xxx revision needed for multichar stuff */ 812 start = p_b_symbol(p); 813 if (SEE('-') && MORE2() && PEEK2() != ']') { 814 /* range */ 815 NEXT(); 816 if (EAT('-')) 817 finish = '-'; 818 else 819 finish = p_b_symbol(p); 820 } else 821 finish = start; 822 if (start == finish) 823 CHadd(cs, start); 824 else { 825 if (__collate_load_error) { 826 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 827 for (i = (uch)start; i <= (uch)finish; i++) 828 CHadd(cs, i); 829 } else { 830 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 831 for (i = CHAR_MIN; i <= CHAR_MAX; i++) { 832 if ( __collate_range_cmp(start, i) <= 0 833 && __collate_range_cmp(i, finish) <= 0 834 ) 835 CHadd(cs, i); 836 } 837 } 838 } 839 break; 840 } 841} 842 843/* 844 - p_b_cclass - parse a character-class name and deal with it 845 == static void p_b_cclass(register struct parse *p, register cset *cs); 846 */ 847static void 848p_b_cclass(p, cs) 849register struct parse *p; 850register cset *cs; 851{ 852 register int c; 853 register char *sp = p->next; 854 register struct cclass *cp; 855 register size_t len; 856 857 while (MORE() && isalpha((uch)PEEK())) 858 NEXT(); 859 len = p->next - sp; 860 for (cp = cclasses; cp->name != NULL; cp++) 861 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 862 break; 863 if (cp->name == NULL) { 864 /* oops, didn't find it */ 865 SETERROR(REG_ECTYPE); 866 return; 867 } 868 869 switch (cp->fidx) { 870 case CALNUM: 871 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 872 if (isalnum((uch)c)) 873 CHadd(cs, c); 874 break; 875 case CALPHA: 876 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 877 if (isalpha((uch)c)) 878 CHadd(cs, c); 879 break; 880 case CBLANK: 881 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 882 if (isblank((uch)c)) 883 CHadd(cs, c); 884 break; 885 case CCNTRL: 886 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 887 if (iscntrl((uch)c)) 888 CHadd(cs, c); 889 break; 890 case CDIGIT: 891 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 892 if (isdigit((uch)c)) 893 CHadd(cs, c); 894 break; 895 case CGRAPH: 896 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 897 if (isgraph((uch)c)) 898 CHadd(cs, c); 899 break; 900 case CLOWER: 901 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 902 if (islower((uch)c)) 903 CHadd(cs, c); 904 break; 905 case CPRINT: 906 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 907 if (isprint((uch)c)) 908 CHadd(cs, c); 909 break; 910 case CPUNCT: 911 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 912 if (ispunct((uch)c)) 913 CHadd(cs, c); 914 break; 915 case CSPACE: 916 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 917 if (isspace((uch)c)) 918 CHadd(cs, c); 919 break; 920 case CUPPER: 921 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 922 if (isupper((uch)c)) 923 CHadd(cs, c); 924 break; 925 case CXDIGIT: 926 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 927 if (isxdigit((uch)c)) 928 CHadd(cs, c); 929 break; 930 } 931#if 0 932 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 933 MCadd(p, cs, u); 934#endif 935} 936 937/* 938 - p_b_eclass - parse an equivalence-class name and deal with it 939 == static void p_b_eclass(register struct parse *p, register cset *cs); 940 * 941 * This implementation is incomplete. xxx 942 */ 943static void 944p_b_eclass(p, cs) 945register struct parse *p; 946register cset *cs; 947{ 948 register char c; 949 950 c = p_b_coll_elem(p, '='); 951 CHadd(cs, c); 952} 953 954/* 955 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 956 == static char p_b_symbol(register struct parse *p); 957 */ 958static char /* value of symbol */ 959p_b_symbol(p) 960register struct parse *p; 961{ 962 register char value; 963 964 (void)REQUIRE(MORE(), REG_EBRACK); 965 if (!EATTWO('[', '.')) 966 return(GETNEXT()); 967 968 /* collating symbol */ 969 value = p_b_coll_elem(p, '.'); 970 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 971 return(value); 972} 973 974/* 975 - p_b_coll_elem - parse a collating-element name and look it up 976 == static char p_b_coll_elem(register struct parse *p, int endc); 977 */ 978static char /* value of collating element */ 979p_b_coll_elem(p, endc) 980register struct parse *p; 981int endc; /* name ended by endc,']' */ 982{ 983 register char *sp = p->next; 984 register struct cname *cp; 985 register int len; 986 987 while (MORE() && !SEETWO(endc, ']')) 988 NEXT(); 989 if (!MORE()) { 990 SETERROR(REG_EBRACK); 991 return(0); 992 } 993 len = p->next - sp; 994 for (cp = cnames; cp->name != NULL; cp++) 995 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 996 return(cp->code); /* known name */ 997 if (len == 1) 998 return(*sp); /* single character */ 999 SETERROR(REG_ECOLLATE); /* neither */ 1000 return(0); 1001} 1002 1003/* 1004 - othercase - return the case counterpart of an alphabetic 1005 == static char othercase(int ch); 1006 */ 1007static char /* if no counterpart, return ch */ 1008othercase(ch) 1009int ch; 1010{ 1011 ch = (uch)ch; 1012 assert(isalpha(ch)); 1013 if (isupper(ch)) 1014 return(tolower(ch)); 1015 else if (islower(ch)) 1016 return(toupper(ch)); 1017 else /* peculiar, but could happen */ 1018 return(ch); 1019} 1020 1021/* 1022 - bothcases - emit a dualcase version of a two-case character 1023 == static void bothcases(register struct parse *p, int ch); 1024 * 1025 * Boy, is this implementation ever a kludge... 1026 */ 1027static void 1028bothcases(p, ch) 1029register struct parse *p; 1030int ch; 1031{ 1032 register char *oldnext = p->next; 1033 register char *oldend = p->end; 1034 char bracket[3]; 1035 1036 ch = (uch)ch; 1037 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1038 p->next = bracket; 1039 p->end = bracket+2; 1040 bracket[0] = ch; 1041 bracket[1] = ']'; 1042 bracket[2] = '\0'; 1043 p_bracket(p); 1044 assert(p->next == bracket+2); 1045 p->next = oldnext; 1046 p->end = oldend; 1047} 1048 1049/* 1050 - ordinary - emit an ordinary character 1051 == static void ordinary(register struct parse *p, register int ch); 1052 */ 1053static void 1054ordinary(p, ch) 1055register struct parse *p; 1056register int ch; 1057{ 1058 register cat_t *cap = p->g->categories; 1059 1060 if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 1061 bothcases(p, ch); 1062 else { 1063 EMIT(OCHAR, (uch)ch); 1064 if (cap[ch] == 0) 1065 cap[ch] = p->g->ncategories++; 1066 } 1067} 1068 1069/* 1070 - nonnewline - emit REG_NEWLINE version of OANY 1071 == static void nonnewline(register struct parse *p); 1072 * 1073 * Boy, is this implementation ever a kludge... 1074 */ 1075static void 1076nonnewline(p) 1077register struct parse *p; 1078{ 1079 register char *oldnext = p->next; 1080 register char *oldend = p->end; 1081 char bracket[4]; 1082 1083 p->next = bracket; 1084 p->end = bracket+3; 1085 bracket[0] = '^'; 1086 bracket[1] = '\n'; 1087 bracket[2] = ']'; 1088 bracket[3] = '\0'; 1089 p_bracket(p); 1090 assert(p->next == bracket+3); 1091 p->next = oldnext; 1092 p->end = oldend; 1093} 1094 1095/* 1096 - repeat - generate code for a bounded repetition, recursively if needed 1097 == static void repeat(register struct parse *p, sopno start, int from, int to); 1098 */ 1099static void 1100repeat(p, start, from, to) 1101register struct parse *p; 1102sopno start; /* operand from here to end of strip */ 1103int from; /* repeated from this number */ 1104int to; /* to this number of times (maybe INFINITY) */ 1105{ 1106 register sopno finish = HERE(); 1107# define N 2 1108# define INF 3 1109# define REP(f, t) ((f)*8 + (t)) 1110# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1111 register sopno copy; 1112 1113 if (p->error != 0) /* head off possible runaway recursion */ 1114 return; 1115 1116 assert(from <= to); 1117 1118 switch (REP(MAP(from), MAP(to))) { 1119 case REP(0, 0): /* must be user doing this */ 1120 DROP(finish-start); /* drop the operand */ 1121 break; 1122 case REP(0, 1): /* as x{1,1}? */ 1123 case REP(0, N): /* as x{1,n}? */ 1124 case REP(0, INF): /* as x{1,}? */ 1125 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1126 INSERT(OCH_, start); /* offset is wrong... */ 1127 repeat(p, start+1, 1, to); 1128 ASTERN(OOR1, start); 1129 AHEAD(start); /* ... fix it */ 1130 EMIT(OOR2, 0); 1131 AHEAD(THERE()); 1132 ASTERN(O_CH, THERETHERE()); 1133 break; 1134 case REP(1, 1): /* trivial case */ 1135 /* done */ 1136 break; 1137 case REP(1, N): /* as x?x{1,n-1} */ 1138 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1139 INSERT(OCH_, start); 1140 ASTERN(OOR1, start); 1141 AHEAD(start); 1142 EMIT(OOR2, 0); /* offset very wrong... */ 1143 AHEAD(THERE()); /* ...so fix it */ 1144 ASTERN(O_CH, THERETHERE()); 1145 copy = dupl(p, start+1, finish+1); 1146 assert(copy == finish+4); 1147 repeat(p, copy, 1, to-1); 1148 break; 1149 case REP(1, INF): /* as x+ */ 1150 INSERT(OPLUS_, start); 1151 ASTERN(O_PLUS, start); 1152 break; 1153 case REP(N, N): /* as xx{m-1,n-1} */ 1154 copy = dupl(p, start, finish); 1155 repeat(p, copy, from-1, to-1); 1156 break; 1157 case REP(N, INF): /* as xx{n-1,INF} */ 1158 copy = dupl(p, start, finish); 1159 repeat(p, copy, from-1, to); 1160 break; 1161 default: /* "can't happen" */ 1162 SETERROR(REG_ASSERT); /* just in case */ 1163 break; 1164 } 1165} 1166 1167/* 1168 - seterr - set an error condition 1169 == static int seterr(register struct parse *p, int e); 1170 */ 1171static int /* useless but makes type checking happy */ 1172seterr(p, e) 1173register struct parse *p; 1174int e; 1175{ 1176 if (p->error == 0) /* keep earliest error condition */ 1177 p->error = e; 1178 p->next = nuls; /* try to bring things to a halt */ 1179 p->end = nuls; 1180 return(0); /* make the return value well-defined */ 1181} 1182 1183/* 1184 - allocset - allocate a set of characters for [] 1185 == static cset *allocset(register struct parse *p); 1186 */ 1187static cset * 1188allocset(p) 1189register struct parse *p; 1190{ 1191 register int no = p->g->ncsets++; 1192 register size_t nc; 1193 register size_t nbytes; 1194 register cset *cs; 1195 register size_t css = (size_t)p->g->csetsize; 1196 register int i; 1197 1198 if (no >= p->ncsalloc) { /* need another column of space */ 1199 p->ncsalloc += CHAR_BIT; 1200 nc = p->ncsalloc; 1201 assert(nc % CHAR_BIT == 0); 1202 nbytes = nc / CHAR_BIT * css; 1203 if (p->g->sets == NULL) 1204 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1205 else 1206 p->g->sets = (cset *)reallocf((char *)p->g->sets, 1207 nc * sizeof(cset)); 1208 if (p->g->setbits == NULL) 1209 p->g->setbits = (uch *)malloc(nbytes); 1210 else { 1211 p->g->setbits = (uch *)reallocf((char *)p->g->setbits, 1212 nbytes); 1213 /* xxx this isn't right if setbits is now NULL */ 1214 for (i = 0; i < no; i++) 1215 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1216 } 1217 if (p->g->sets != NULL && p->g->setbits != NULL) 1218 (void) memset((char *)p->g->setbits + (nbytes - css), 1219 0, css); 1220 else { 1221 no = 0; 1222 SETERROR(REG_ESPACE); 1223 /* caller's responsibility not to do set ops */ 1224 } 1225 } 1226 1227 assert(p->g->sets != NULL); /* xxx */ 1228 cs = &p->g->sets[no]; 1229 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1230 cs->mask = 1 << ((no) % CHAR_BIT); 1231 cs->hash = 0; 1232 cs->smultis = 0; 1233 cs->multis = NULL; 1234 1235 return(cs); 1236} 1237 1238/* 1239 - freeset - free a now-unused set 1240 == static void freeset(register struct parse *p, register cset *cs); 1241 */ 1242static void 1243freeset(p, cs) 1244register struct parse *p; 1245register cset *cs; 1246{ 1247 register int i; 1248 register cset *top = &p->g->sets[p->g->ncsets]; 1249 register size_t css = (size_t)p->g->csetsize; 1250 1251 for (i = 0; i < css; i++) 1252 CHsub(cs, i); 1253 if (cs == top-1) /* recover only the easy case */ 1254 p->g->ncsets--; 1255} 1256 1257/* 1258 - freezeset - final processing on a set of characters 1259 == static int freezeset(register struct parse *p, register cset *cs); 1260 * 1261 * The main task here is merging identical sets. This is usually a waste 1262 * of time (although the hash code minimizes the overhead), but can win 1263 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1264 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1265 * the same value! 1266 */ 1267static int /* set number */ 1268freezeset(p, cs) 1269register struct parse *p; 1270register cset *cs; 1271{ 1272 register short h = cs->hash; 1273 register int i; 1274 register cset *top = &p->g->sets[p->g->ncsets]; 1275 register cset *cs2; 1276 register size_t css = (size_t)p->g->csetsize; 1277 1278 /* look for an earlier one which is the same */ 1279 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1280 if (cs2->hash == h && cs2 != cs) { 1281 /* maybe */ 1282 for (i = 0; i < css; i++) 1283 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1284 break; /* no */ 1285 if (i == css) 1286 break; /* yes */ 1287 } 1288 1289 if (cs2 < top) { /* found one */ 1290 freeset(p, cs); 1291 cs = cs2; 1292 } 1293 1294 return((int)(cs - p->g->sets)); 1295} 1296 1297/* 1298 - firstch - return first character in a set (which must have at least one) 1299 == static int firstch(register struct parse *p, register cset *cs); 1300 */ 1301static int /* character; there is no "none" value */ 1302firstch(p, cs) 1303register struct parse *p; 1304register cset *cs; 1305{ 1306 register int i; 1307 register size_t css = (size_t)p->g->csetsize; 1308 1309 for (i = 0; i < css; i++) 1310 if (CHIN(cs, i)) 1311 return((char)i); 1312 assert(never); 1313 return(0); /* arbitrary */ 1314} 1315 1316/* 1317 - nch - number of characters in a set 1318 == static int nch(register struct parse *p, register cset *cs); 1319 */ 1320static int 1321nch(p, cs) 1322register struct parse *p; 1323register cset *cs; 1324{ 1325 register int i; 1326 register size_t css = (size_t)p->g->csetsize; 1327 register int n = 0; 1328 1329 for (i = 0; i < css; i++) 1330 if (CHIN(cs, i)) 1331 n++; 1332 return(n); 1333} 1334 1335/* 1336 - mcadd - add a collating element to a cset 1337 == static void mcadd(register struct parse *p, register cset *cs, \ 1338 == register char *cp); 1339 */ 1340static void 1341mcadd(p, cs, cp) 1342register struct parse *p; 1343register cset *cs; 1344register char *cp; 1345{ 1346 register size_t oldend = cs->smultis; 1347 1348 cs->smultis += strlen(cp) + 1; 1349 if (cs->multis == NULL) 1350 cs->multis = malloc(cs->smultis); 1351 else 1352 cs->multis = reallocf(cs->multis, cs->smultis); 1353 if (cs->multis == NULL) { 1354 SETERROR(REG_ESPACE); 1355 return; 1356 } 1357 1358 (void) strcpy(cs->multis + oldend - 1, cp); 1359 cs->multis[cs->smultis - 1] = '\0'; 1360} 1361 1362#if used 1363/* 1364 - mcsub - subtract a collating element from a cset 1365 == static void mcsub(register cset *cs, register char *cp); 1366 */ 1367static void 1368mcsub(cs, cp) 1369register cset *cs; 1370register char *cp; 1371{ 1372 register char *fp = mcfind(cs, cp); 1373 register size_t len = strlen(fp); 1374 1375 assert(fp != NULL); 1376 (void) memmove(fp, fp + len + 1, 1377 cs->smultis - (fp + len + 1 - cs->multis)); 1378 cs->smultis -= len; 1379 1380 if (cs->smultis == 0) { 1381 free(cs->multis); 1382 cs->multis = NULL; 1383 return; 1384 } 1385 1386 cs->multis = reallocf(cs->multis, cs->smultis); 1387 assert(cs->multis != NULL); 1388} 1389 1390/* 1391 - mcin - is a collating element in a cset? 1392 == static int mcin(register cset *cs, register char *cp); 1393 */ 1394static int 1395mcin(cs, cp) 1396register cset *cs; 1397register char *cp; 1398{ 1399 return(mcfind(cs, cp) != NULL); 1400} 1401 1402/* 1403 - mcfind - find a collating element in a cset 1404 == static char *mcfind(register cset *cs, register char *cp); 1405 */ 1406static char * 1407mcfind(cs, cp) 1408register cset *cs; 1409register char *cp; 1410{ 1411 register char *p; 1412 1413 if (cs->multis == NULL) 1414 return(NULL); 1415 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1416 if (strcmp(cp, p) == 0) 1417 return(p); 1418 return(NULL); 1419} 1420#endif 1421 1422/* 1423 - mcinvert - invert the list of collating elements in a cset 1424 == static void mcinvert(register struct parse *p, register cset *cs); 1425 * 1426 * This would have to know the set of possibilities. Implementation 1427 * is deferred. 1428 */ 1429static void 1430mcinvert(p, cs) 1431register struct parse *p; 1432register cset *cs; 1433{ 1434 assert(cs->multis == NULL); /* xxx */ 1435} 1436 1437/* 1438 - mccase - add case counterparts of the list of collating elements in a cset 1439 == static void mccase(register struct parse *p, register cset *cs); 1440 * 1441 * This would have to know the set of possibilities. Implementation 1442 * is deferred. 1443 */ 1444static void 1445mccase(p, cs) 1446register struct parse *p; 1447register cset *cs; 1448{ 1449 assert(cs->multis == NULL); /* xxx */ 1450} 1451 1452/* 1453 - isinsets - is this character in any sets? 1454 == static int isinsets(register struct re_guts *g, int c); 1455 */ 1456static int /* predicate */ 1457isinsets(g, c) 1458register struct re_guts *g; 1459int c; 1460{ 1461 register uch *col; 1462 register int i; 1463 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1464 register unsigned uc = (uch)c; 1465 1466 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1467 if (col[uc] != 0) 1468 return(1); 1469 return(0); 1470} 1471 1472/* 1473 - samesets - are these two characters in exactly the same sets? 1474 == static int samesets(register struct re_guts *g, int c1, int c2); 1475 */ 1476static int /* predicate */ 1477samesets(g, c1, c2) 1478register struct re_guts *g; 1479int c1; 1480int c2; 1481{ 1482 register uch *col; 1483 register int i; 1484 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1485 register unsigned uc1 = (uch)c1; 1486 register unsigned uc2 = (uch)c2; 1487 1488 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1489 if (col[uc1] != col[uc2]) 1490 return(0); 1491 return(1); 1492} 1493 1494/* 1495 - categorize - sort out character categories 1496 == static void categorize(struct parse *p, register struct re_guts *g); 1497 */ 1498static void 1499categorize(p, g) 1500struct parse *p; 1501register struct re_guts *g; 1502{ 1503 register cat_t *cats = g->categories; 1504 register int c; 1505 register int c2; 1506 register cat_t cat; 1507 1508 /* avoid making error situations worse */ 1509 if (p->error != 0) 1510 return; 1511 1512 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1513 if (cats[c] == 0 && isinsets(g, c)) { 1514 cat = g->ncategories++; 1515 cats[c] = cat; 1516 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1517 if (cats[c2] == 0 && samesets(g, c, c2)) 1518 cats[c2] = cat; 1519 } 1520} 1521 1522/* 1523 - dupl - emit a duplicate of a bunch of sops 1524 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1525 */ 1526static sopno /* start of duplicate */ 1527dupl(p, start, finish) 1528register struct parse *p; 1529sopno start; /* from here */ 1530sopno finish; /* to this less one */ 1531{ 1532 register sopno ret = HERE(); 1533 register sopno len = finish - start; 1534 1535 assert(finish >= start); 1536 if (len == 0) 1537 return(ret); 1538 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1539 assert(p->ssize >= p->slen + len); 1540 (void) memcpy((char *)(p->strip + p->slen), 1541 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1542 p->slen += len; 1543 return(ret); 1544} 1545 1546/* 1547 - doemit - emit a strip operator 1548 == static void doemit(register struct parse *p, sop op, size_t opnd); 1549 * 1550 * It might seem better to implement this as a macro with a function as 1551 * hard-case backup, but it's just too big and messy unless there are 1552 * some changes to the data structures. Maybe later. 1553 */ 1554static void 1555doemit(p, op, opnd) 1556register struct parse *p; 1557sop op; 1558size_t opnd; 1559{ 1560 /* avoid making error situations worse */ 1561 if (p->error != 0) 1562 return; 1563 1564 /* deal with oversize operands ("can't happen", more or less) */ 1565 assert(opnd < 1<<OPSHIFT); 1566 1567 /* deal with undersized strip */ 1568 if (p->slen >= p->ssize) 1569 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1570 assert(p->slen < p->ssize); 1571 1572 /* finally, it's all reduced to the easy case */ 1573 p->strip[p->slen++] = SOP(op, opnd); 1574} 1575 1576/* 1577 - doinsert - insert a sop into the strip 1578 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1579 */ 1580static void 1581doinsert(p, op, opnd, pos) 1582register struct parse *p; 1583sop op; 1584size_t opnd; 1585sopno pos; 1586{ 1587 register sopno sn; 1588 register sop s; 1589 register int i; 1590 1591 /* avoid making error situations worse */ 1592 if (p->error != 0) 1593 return; 1594 1595 sn = HERE(); 1596 EMIT(op, opnd); /* do checks, ensure space */ 1597 assert(HERE() == sn+1); 1598 s = p->strip[sn]; 1599 1600 /* adjust paren pointers */ 1601 assert(pos > 0); 1602 for (i = 1; i < NPAREN; i++) { 1603 if (p->pbegin[i] >= pos) { 1604 p->pbegin[i]++; 1605 } 1606 if (p->pend[i] >= pos) { 1607 p->pend[i]++; 1608 } 1609 } 1610 1611 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1612 (HERE()-pos-1)*sizeof(sop)); 1613 p->strip[pos] = s; 1614} 1615 1616/* 1617 - dofwd - complete a forward reference 1618 == static void dofwd(register struct parse *p, sopno pos, sop value); 1619 */ 1620static void 1621dofwd(p, pos, value) 1622register struct parse *p; 1623register sopno pos; 1624sop value; 1625{ 1626 /* avoid making error situations worse */ 1627 if (p->error != 0) 1628 return; 1629 1630 assert(value < 1<<OPSHIFT); 1631 p->strip[pos] = OP(p->strip[pos]) | value; 1632} 1633 1634/* 1635 - enlarge - enlarge the strip 1636 == static void enlarge(register struct parse *p, sopno size); 1637 */ 1638static void 1639enlarge(p, size) 1640register struct parse *p; 1641register sopno size; 1642{ 1643 register sop *sp; 1644 1645 if (p->ssize >= size) 1646 return; 1647 1648 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1649 if (sp == NULL) { 1650 SETERROR(REG_ESPACE); 1651 return; 1652 } 1653 p->strip = sp; 1654 p->ssize = size; 1655} 1656 1657/* 1658 - stripsnug - compact the strip 1659 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1660 */ 1661static void 1662stripsnug(p, g) 1663register struct parse *p; 1664register struct re_guts *g; 1665{ 1666 g->nstates = p->slen; 1667 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1668 if (g->strip == NULL) { 1669 SETERROR(REG_ESPACE); 1670 g->strip = p->strip; 1671 } 1672} 1673 1674/* 1675 - findmust - fill in must and mlen with longest mandatory literal string 1676 == static void findmust(register struct parse *p, register struct re_guts *g); 1677 * 1678 * This algorithm could do fancy things like analyzing the operands of | 1679 * for common subsequences. Someday. This code is simple and finds most 1680 * of the interesting cases. 1681 * 1682 * Note that must and mlen got initialized during setup. 1683 */ 1684static void 1685findmust(p, g) 1686struct parse *p; 1687register struct re_guts *g; 1688{ 1689 register sop *scan; 1690 sop *start; 1691 register sop *newstart; 1692 register sopno newlen; 1693 register sop s; 1694 register char *cp; 1695 register sopno i; 1696 1697 /* avoid making error situations worse */ 1698 if (p->error != 0) 1699 return; 1700 1701 /* find the longest OCHAR sequence in strip */ 1702 newlen = 0; 1703 scan = g->strip + 1; 1704 do { 1705 s = *scan++; 1706 switch (OP(s)) { 1707 case OCHAR: /* sequence member */ 1708 if (newlen == 0) /* new sequence */ 1709 newstart = scan - 1; 1710 newlen++; 1711 break; 1712 case OPLUS_: /* things that don't break one */ 1713 case OLPAREN: 1714 case ORPAREN: 1715 break; 1716 case OQUEST_: /* things that must be skipped */ 1717 case OCH_: 1718 scan--; 1719 do { 1720 scan += OPND(s); 1721 s = *scan; 1722 /* assert() interferes w debug printouts */ 1723 if (OP(s) != O_QUEST && OP(s) != O_CH && 1724 OP(s) != OOR2) { 1725 g->iflags |= BAD; 1726 return; 1727 } 1728 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1729 /* fallthrough */ 1730 default: /* things that break a sequence */ 1731 if (newlen > g->mlen) { /* ends one */ 1732 start = newstart; 1733 g->mlen = newlen; 1734 } 1735 newlen = 0; 1736 break; 1737 } 1738 } while (OP(s) != OEND); 1739 1740 if (g->mlen == 0) /* there isn't one */ 1741 return; 1742 1743 /* turn it into a character string */ 1744 g->must = malloc((size_t)g->mlen + 1); 1745 if (g->must == NULL) { /* argh; just forget it */ 1746 g->mlen = 0; 1747 return; 1748 } 1749 cp = g->must; 1750 scan = start; 1751 for (i = g->mlen; i > 0; i--) { 1752 while (OP(s = *scan++) != OCHAR) 1753 continue; 1754 assert(cp < g->must + g->mlen); 1755 *cp++ = (char)OPND(s); 1756 } 1757 assert(cp == g->must + g->mlen); 1758 *cp++ = '\0'; /* just on general principles */ 1759} 1760 1761/*
|