283 g->nplus = pluscount(p, g); 284 g->magic = MAGIC2; 285 preg->re_nsub = g->nsub; 286 preg->re_g = g; 287 preg->re_magic = MAGIC1; 288#ifndef REDEBUG 289 /* not debugging, so can't rely on the assert() in regexec() */ 290 if (g->iflags&BAD) 291 SETERROR(REG_ASSERT); 292#endif 293 294 /* win or lose, we're done */ 295 if (p->error != 0) /* lose */ 296 regfree(preg); 297 return(p->error); 298} 299 300/* 301 - p_ere - ERE parser top level, concatenation and alternation 302 == static void p_ere(register struct parse *p, int stop); 303 */ 304static void 305p_ere(p, stop) 306register struct parse *p; 307int stop; /* character this ERE should end at */ 308{ 309 register char c; 310 register sopno prevback; 311 register sopno prevfwd; 312 register sopno conc; 313 register int first = 1; /* is this the first alternative? */ 314 315 for (;;) { 316 /* do a bunch of concatenated expressions */ 317 conc = HERE(); 318 while (MORE() && (c = PEEK()) != '|' && c != stop) 319 p_ere_exp(p); 320 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 321 322 if (!EAT('|')) 323 break; /* NOTE BREAK OUT */ 324 325 if (first) { 326 INSERT(OCH_, conc); /* offset is wrong */ 327 prevfwd = conc; 328 prevback = conc; 329 first = 0; 330 } 331 ASTERN(OOR1, prevback); 332 prevback = THERE(); 333 AHEAD(prevfwd); /* fix previous offset */ 334 prevfwd = HERE(); 335 EMIT(OOR2, 0); /* offset is very wrong */ 336 } 337 338 if (!first) { /* tail-end fixups */ 339 AHEAD(prevfwd); 340 ASTERN(O_CH, prevback); 341 } 342 343 assert(!MORE() || SEE(stop)); 344} 345 346/* 347 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 348 == static void p_ere_exp(register struct parse *p); 349 */ 350static void 351p_ere_exp(p) 352register struct parse *p; 353{ 354 register char c; 355 register sopno pos; 356 register int count; 357 register int count2; 358 register sopno subno; 359 int wascaret = 0; 360 361 assert(MORE()); /* caller should have ensured this */ 362 c = GETNEXT(); 363 364 pos = HERE(); 365 switch (c) { 366 case '(': 367 (void)REQUIRE(MORE(), REG_EPAREN); 368 p->g->nsub++; 369 subno = p->g->nsub; 370 if (subno < NPAREN) 371 p->pbegin[subno] = HERE(); 372 EMIT(OLPAREN, subno); 373 if (!SEE(')')) 374 p_ere(p, ')'); 375 if (subno < NPAREN) { 376 p->pend[subno] = HERE(); 377 assert(p->pend[subno] != 0); 378 } 379 EMIT(ORPAREN, subno); 380 (void)MUSTEAT(')', REG_EPAREN); 381 break; 382#ifndef POSIX_MISTAKE 383 case ')': /* happens only if no current unmatched ( */ 384 /* 385 * You may ask, why the ifndef? Because I didn't notice 386 * this until slightly too late for 1003.2, and none of the 387 * other 1003.2 regular-expression reviewers noticed it at 388 * all. So an unmatched ) is legal POSIX, at least until 389 * we can get it fixed. 390 */ 391 SETERROR(REG_EPAREN); 392 break; 393#endif 394 case '^': 395 EMIT(OBOL, 0); 396 p->g->iflags |= USEBOL; 397 p->g->nbol++; 398 wascaret = 1; 399 break; 400 case '$': 401 EMIT(OEOL, 0); 402 p->g->iflags |= USEEOL; 403 p->g->neol++; 404 break; 405 case '|': 406 SETERROR(REG_EMPTY); 407 break; 408 case '*': 409 case '+': 410 case '?': 411 SETERROR(REG_BADRPT); 412 break; 413 case '.': 414 if (p->g->cflags®_NEWLINE) 415 nonnewline(p); 416 else 417 EMIT(OANY, 0); 418 break; 419 case '[': 420 p_bracket(p); 421 break; 422 case '\\': 423 (void)REQUIRE(MORE(), REG_EESCAPE); 424 c = GETNEXT(); 425 ordinary(p, c); 426 break; 427 case '{': /* okay as ordinary except if digit follows */ 428 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 429 /* FALLTHROUGH */ 430 default: 431 ordinary(p, c); 432 break; 433 } 434 435 if (!MORE()) 436 return; 437 c = PEEK(); 438 /* we call { a repetition if followed by a digit */ 439 if (!( c == '*' || c == '+' || c == '?' || 440 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 441 return; /* no repetition, we're done */ 442 NEXT(); 443 444 (void)REQUIRE(!wascaret, REG_BADRPT); 445 switch (c) { 446 case '*': /* implemented as +? */ 447 /* this case does not require the (y|) trick, noKLUDGE */ 448 INSERT(OPLUS_, pos); 449 ASTERN(O_PLUS, pos); 450 INSERT(OQUEST_, pos); 451 ASTERN(O_QUEST, pos); 452 break; 453 case '+': 454 INSERT(OPLUS_, pos); 455 ASTERN(O_PLUS, pos); 456 break; 457 case '?': 458 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 459 INSERT(OCH_, pos); /* offset slightly wrong */ 460 ASTERN(OOR1, pos); /* this one's right */ 461 AHEAD(pos); /* fix the OCH_ */ 462 EMIT(OOR2, 0); /* offset very wrong... */ 463 AHEAD(THERE()); /* ...so fix it */ 464 ASTERN(O_CH, THERETHERE()); 465 break; 466 case '{': 467 count = p_count(p); 468 if (EAT(',')) { 469 if (isdigit((uch)PEEK())) { 470 count2 = p_count(p); 471 (void)REQUIRE(count <= count2, REG_BADBR); 472 } else /* single number with comma */ 473 count2 = INFINITY; 474 } else /* just a single number */ 475 count2 = count; 476 repeat(p, pos, count, count2); 477 if (!EAT('}')) { /* error heuristics */ 478 while (MORE() && PEEK() != '}') 479 NEXT(); 480 (void)REQUIRE(MORE(), REG_EBRACE); 481 SETERROR(REG_BADBR); 482 } 483 break; 484 } 485 486 if (!MORE()) 487 return; 488 c = PEEK(); 489 if (!( c == '*' || c == '+' || c == '?' || 490 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 491 return; 492 SETERROR(REG_BADRPT); 493} 494 495/* 496 - p_str - string (no metacharacters) "parser" 497 == static void p_str(register struct parse *p); 498 */ 499static void 500p_str(p) 501register struct parse *p; 502{ 503 (void)REQUIRE(MORE(), REG_EMPTY); 504 while (MORE()) 505 ordinary(p, GETNEXT()); 506} 507 508/* 509 - p_bre - BRE parser top level, anchoring and concatenation 510 == static void p_bre(register struct parse *p, register int end1, \ 511 == register int end2); 512 * Giving end1 as OUT essentially eliminates the end1/end2 check. 513 * 514 * This implementation is a bit of a kludge, in that a trailing $ is first 515 * taken as an ordinary character and then revised to be an anchor. The 516 * only undesirable side effect is that '$' gets included as a character 517 * category in such cases. This is fairly harmless; not worth fixing. 518 * The amount of lookahead needed to avoid this kludge is excessive. 519 */ 520static void 521p_bre(p, end1, end2) 522register struct parse *p; 523register int end1; /* first terminating character */ 524register int end2; /* second terminating character */ 525{ 526 register sopno start = HERE(); 527 register int first = 1; /* first subexpression? */ 528 register int wasdollar = 0; 529 530 if (EAT('^')) { 531 EMIT(OBOL, 0); 532 p->g->iflags |= USEBOL; 533 p->g->nbol++; 534 } 535 while (MORE() && !SEETWO(end1, end2)) { 536 wasdollar = p_simp_re(p, first); 537 first = 0; 538 } 539 if (wasdollar) { /* oops, that was a trailing anchor */ 540 DROP(1); 541 EMIT(OEOL, 0); 542 p->g->iflags |= USEEOL; 543 p->g->neol++; 544 } 545 546 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 547} 548 549/* 550 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 551 == static int p_simp_re(register struct parse *p, int starordinary); 552 */ 553static int /* was the simple RE an unbackslashed $? */ 554p_simp_re(p, starordinary) 555register struct parse *p; 556int starordinary; /* is a leading * an ordinary character? */ 557{ 558 register int c; 559 register int count; 560 register int count2; 561 register sopno pos; 562 register int i; 563 register sopno subno; 564# define BACKSL (1<<CHAR_BIT) 565 566 pos = HERE(); /* repetion op, if any, covers from here */ 567 568 assert(MORE()); /* caller should have ensured this */ 569 c = GETNEXT(); 570 if (c == '\\') { 571 (void)REQUIRE(MORE(), REG_EESCAPE); 572 c = BACKSL | GETNEXT(); 573 } 574 switch (c) { 575 case '.': 576 if (p->g->cflags®_NEWLINE) 577 nonnewline(p); 578 else 579 EMIT(OANY, 0); 580 break; 581 case '[': 582 p_bracket(p); 583 break; 584 case BACKSL|'{': 585 SETERROR(REG_BADRPT); 586 break; 587 case BACKSL|'(': 588 p->g->nsub++; 589 subno = p->g->nsub; 590 if (subno < NPAREN) 591 p->pbegin[subno] = HERE(); 592 EMIT(OLPAREN, subno); 593 /* the MORE here is an error heuristic */ 594 if (MORE() && !SEETWO('\\', ')')) 595 p_bre(p, '\\', ')'); 596 if (subno < NPAREN) { 597 p->pend[subno] = HERE(); 598 assert(p->pend[subno] != 0); 599 } 600 EMIT(ORPAREN, subno); 601 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 602 break; 603 case BACKSL|')': /* should not get here -- must be user */ 604 case BACKSL|'}': 605 SETERROR(REG_EPAREN); 606 break; 607 case BACKSL|'1': 608 case BACKSL|'2': 609 case BACKSL|'3': 610 case BACKSL|'4': 611 case BACKSL|'5': 612 case BACKSL|'6': 613 case BACKSL|'7': 614 case BACKSL|'8': 615 case BACKSL|'9': 616 i = (c&~BACKSL) - '0'; 617 assert(i < NPAREN); 618 if (p->pend[i] != 0) { 619 assert(i <= p->g->nsub); 620 EMIT(OBACK_, i); 621 assert(p->pbegin[i] != 0); 622 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 623 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 624 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 625 EMIT(O_BACK, i); 626 } else 627 SETERROR(REG_ESUBREG); 628 p->g->backrefs = 1; 629 break; 630 case '*': 631 (void)REQUIRE(starordinary, REG_BADRPT); 632 /* FALLTHROUGH */ 633 default: 634 ordinary(p, (char)c); 635 break; 636 } 637 638 if (EAT('*')) { /* implemented as +? */ 639 /* this case does not require the (y|) trick, noKLUDGE */ 640 INSERT(OPLUS_, pos); 641 ASTERN(O_PLUS, pos); 642 INSERT(OQUEST_, pos); 643 ASTERN(O_QUEST, pos); 644 } else if (EATTWO('\\', '{')) { 645 count = p_count(p); 646 if (EAT(',')) { 647 if (MORE() && isdigit((uch)PEEK())) { 648 count2 = p_count(p); 649 (void)REQUIRE(count <= count2, REG_BADBR); 650 } else /* single number with comma */ 651 count2 = INFINITY; 652 } else /* just a single number */ 653 count2 = count; 654 repeat(p, pos, count, count2); 655 if (!EATTWO('\\', '}')) { /* error heuristics */ 656 while (MORE() && !SEETWO('\\', '}')) 657 NEXT(); 658 (void)REQUIRE(MORE(), REG_EBRACE); 659 SETERROR(REG_BADBR); 660 } 661 } else if (c == '$') /* $ (but not \$) ends it */ 662 return(1); 663 664 return(0); 665} 666 667/* 668 - p_count - parse a repetition count 669 == static int p_count(register struct parse *p); 670 */ 671static int /* the value */ 672p_count(p) 673register struct parse *p; 674{ 675 register int count = 0; 676 register int ndigits = 0; 677 678 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 679 count = count*10 + (GETNEXT() - '0'); 680 ndigits++; 681 } 682 683 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 684 return(count); 685} 686 687/* 688 - p_bracket - parse a bracketed character list 689 == static void p_bracket(register struct parse *p); 690 * 691 * Note a significant property of this code: if the allocset() did SETERROR, 692 * no set operations are done. 693 */ 694static void 695p_bracket(p) 696register struct parse *p; 697{ 698 register cset *cs = allocset(p); 699 register int invert = 0; 700 701 /* Dept of Truly Sickening Special-Case Kludges */ 702 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 703 EMIT(OBOW, 0); 704 NEXTn(6); 705 return; 706 } 707 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 708 EMIT(OEOW, 0); 709 NEXTn(6); 710 return; 711 } 712 713 if (EAT('^')) 714 invert++; /* make note to invert set at end */ 715 if (EAT(']')) 716 CHadd(cs, ']'); 717 else if (EAT('-')) 718 CHadd(cs, '-'); 719 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 720 p_b_term(p, cs); 721 if (EAT('-')) 722 CHadd(cs, '-'); 723 (void)MUSTEAT(']', REG_EBRACK); 724 725 if (p->error != 0) /* don't mess things up further */ 726 return; 727 728 if (p->g->cflags®_ICASE) { 729 register int i; 730 register int ci; 731 732 for (i = p->g->csetsize - 1; i >= 0; i--) 733 if (CHIN(cs, i) && isalpha(i)) { 734 ci = othercase(i); 735 if (ci != i) 736 CHadd(cs, ci); 737 } 738 if (cs->multis != NULL) 739 mccase(p, cs); 740 } 741 if (invert) { 742 register int i; 743 744 for (i = p->g->csetsize - 1; i >= 0; i--) 745 if (CHIN(cs, i)) 746 CHsub(cs, i); 747 else 748 CHadd(cs, i); 749 if (p->g->cflags®_NEWLINE) 750 CHsub(cs, '\n'); 751 if (cs->multis != NULL) 752 mcinvert(p, cs); 753 } 754 755 assert(cs->multis == NULL); /* xxx */ 756 757 if (nch(p, cs) == 1) { /* optimize singleton sets */ 758 ordinary(p, firstch(p, cs)); 759 freeset(p, cs); 760 } else 761 EMIT(OANYOF, freezeset(p, cs)); 762} 763 764/* 765 - p_b_term - parse one term of a bracketed character list 766 == static void p_b_term(register struct parse *p, register cset *cs); 767 */ 768static void 769p_b_term(p, cs) 770register struct parse *p; 771register cset *cs; 772{ 773 register char c; 774 register char start, finish; 775 register int i; 776 777 /* classify what we've got */ 778 switch ((MORE()) ? PEEK() : '\0') { 779 case '[': 780 c = (MORE2()) ? PEEK2() : '\0'; 781 break; 782 case '-': 783 SETERROR(REG_ERANGE); 784 return; /* NOTE RETURN */ 785 break; 786 default: 787 c = '\0'; 788 break; 789 } 790 791 switch (c) { 792 case ':': /* character class */ 793 NEXT2(); 794 (void)REQUIRE(MORE(), REG_EBRACK); 795 c = PEEK(); 796 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 797 p_b_cclass(p, cs); 798 (void)REQUIRE(MORE(), REG_EBRACK); 799 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 800 break; 801 case '=': /* equivalence class */ 802 NEXT2(); 803 (void)REQUIRE(MORE(), REG_EBRACK); 804 c = PEEK(); 805 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 806 p_b_eclass(p, cs); 807 (void)REQUIRE(MORE(), REG_EBRACK); 808 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 809 break; 810 default: /* symbol, ordinary character, or range */ 811/* xxx revision needed for multichar stuff */ 812 start = p_b_symbol(p); 813 if (SEE('-') && MORE2() && PEEK2() != ']') { 814 /* range */ 815 NEXT(); 816 if (EAT('-')) 817 finish = '-'; 818 else 819 finish = p_b_symbol(p); 820 } else 821 finish = start; 822 if (start == finish) 823 CHadd(cs, start); 824 else { 825 if (__collate_load_error) { 826 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 827 for (i = (uch)start; i <= (uch)finish; i++) 828 CHadd(cs, i); 829 } else { 830 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 831 for (i = CHAR_MIN; i <= CHAR_MAX; i++) { 832 if ( __collate_range_cmp(start, i) <= 0 833 && __collate_range_cmp(i, finish) <= 0 834 ) 835 CHadd(cs, i); 836 } 837 } 838 } 839 break; 840 } 841} 842 843/* 844 - p_b_cclass - parse a character-class name and deal with it 845 == static void p_b_cclass(register struct parse *p, register cset *cs); 846 */ 847static void 848p_b_cclass(p, cs) 849register struct parse *p; 850register cset *cs; 851{ 852 register int c; 853 register char *sp = p->next; 854 register struct cclass *cp; 855 register size_t len; 856 857 while (MORE() && isalpha((uch)PEEK())) 858 NEXT(); 859 len = p->next - sp; 860 for (cp = cclasses; cp->name != NULL; cp++) 861 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 862 break; 863 if (cp->name == NULL) { 864 /* oops, didn't find it */ 865 SETERROR(REG_ECTYPE); 866 return; 867 } 868 869 switch (cp->fidx) { 870 case CALNUM: 871 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 872 if (isalnum((uch)c)) 873 CHadd(cs, c); 874 break; 875 case CALPHA: 876 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 877 if (isalpha((uch)c)) 878 CHadd(cs, c); 879 break; 880 case CBLANK: 881 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 882 if (isblank((uch)c)) 883 CHadd(cs, c); 884 break; 885 case CCNTRL: 886 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 887 if (iscntrl((uch)c)) 888 CHadd(cs, c); 889 break; 890 case CDIGIT: 891 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 892 if (isdigit((uch)c)) 893 CHadd(cs, c); 894 break; 895 case CGRAPH: 896 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 897 if (isgraph((uch)c)) 898 CHadd(cs, c); 899 break; 900 case CLOWER: 901 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 902 if (islower((uch)c)) 903 CHadd(cs, c); 904 break; 905 case CPRINT: 906 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 907 if (isprint((uch)c)) 908 CHadd(cs, c); 909 break; 910 case CPUNCT: 911 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 912 if (ispunct((uch)c)) 913 CHadd(cs, c); 914 break; 915 case CSPACE: 916 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 917 if (isspace((uch)c)) 918 CHadd(cs, c); 919 break; 920 case CUPPER: 921 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 922 if (isupper((uch)c)) 923 CHadd(cs, c); 924 break; 925 case CXDIGIT: 926 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 927 if (isxdigit((uch)c)) 928 CHadd(cs, c); 929 break; 930 } 931#if 0 932 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 933 MCadd(p, cs, u); 934#endif 935} 936 937/* 938 - p_b_eclass - parse an equivalence-class name and deal with it 939 == static void p_b_eclass(register struct parse *p, register cset *cs); 940 * 941 * This implementation is incomplete. xxx 942 */ 943static void 944p_b_eclass(p, cs) 945register struct parse *p; 946register cset *cs; 947{ 948 register char c; 949 950 c = p_b_coll_elem(p, '='); 951 CHadd(cs, c); 952} 953 954/* 955 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 956 == static char p_b_symbol(register struct parse *p); 957 */ 958static char /* value of symbol */ 959p_b_symbol(p) 960register struct parse *p; 961{ 962 register char value; 963 964 (void)REQUIRE(MORE(), REG_EBRACK); 965 if (!EATTWO('[', '.')) 966 return(GETNEXT()); 967 968 /* collating symbol */ 969 value = p_b_coll_elem(p, '.'); 970 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 971 return(value); 972} 973 974/* 975 - p_b_coll_elem - parse a collating-element name and look it up 976 == static char p_b_coll_elem(register struct parse *p, int endc); 977 */ 978static char /* value of collating element */ 979p_b_coll_elem(p, endc) 980register struct parse *p; 981int endc; /* name ended by endc,']' */ 982{ 983 register char *sp = p->next; 984 register struct cname *cp; 985 register int len; 986 987 while (MORE() && !SEETWO(endc, ']')) 988 NEXT(); 989 if (!MORE()) { 990 SETERROR(REG_EBRACK); 991 return(0); 992 } 993 len = p->next - sp; 994 for (cp = cnames; cp->name != NULL; cp++) 995 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 996 return(cp->code); /* known name */ 997 if (len == 1) 998 return(*sp); /* single character */ 999 SETERROR(REG_ECOLLATE); /* neither */ 1000 return(0); 1001} 1002 1003/* 1004 - othercase - return the case counterpart of an alphabetic 1005 == static char othercase(int ch); 1006 */ 1007static char /* if no counterpart, return ch */ 1008othercase(ch) 1009int ch; 1010{ 1011 ch = (uch)ch; 1012 assert(isalpha(ch)); 1013 if (isupper(ch)) 1014 return(tolower(ch)); 1015 else if (islower(ch)) 1016 return(toupper(ch)); 1017 else /* peculiar, but could happen */ 1018 return(ch); 1019} 1020 1021/* 1022 - bothcases - emit a dualcase version of a two-case character 1023 == static void bothcases(register struct parse *p, int ch); 1024 * 1025 * Boy, is this implementation ever a kludge... 1026 */ 1027static void 1028bothcases(p, ch) 1029register struct parse *p; 1030int ch; 1031{ 1032 register char *oldnext = p->next; 1033 register char *oldend = p->end; 1034 char bracket[3]; 1035 1036 ch = (uch)ch; 1037 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1038 p->next = bracket; 1039 p->end = bracket+2; 1040 bracket[0] = ch; 1041 bracket[1] = ']'; 1042 bracket[2] = '\0'; 1043 p_bracket(p); 1044 assert(p->next == bracket+2); 1045 p->next = oldnext; 1046 p->end = oldend; 1047} 1048 1049/* 1050 - ordinary - emit an ordinary character 1051 == static void ordinary(register struct parse *p, register int ch); 1052 */ 1053static void 1054ordinary(p, ch) 1055register struct parse *p; 1056register int ch; 1057{ 1058 register cat_t *cap = p->g->categories; 1059 1060 if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 1061 bothcases(p, ch); 1062 else { 1063 EMIT(OCHAR, (uch)ch); 1064 if (cap[ch] == 0) 1065 cap[ch] = p->g->ncategories++; 1066 } 1067} 1068 1069/* 1070 - nonnewline - emit REG_NEWLINE version of OANY 1071 == static void nonnewline(register struct parse *p); 1072 * 1073 * Boy, is this implementation ever a kludge... 1074 */ 1075static void 1076nonnewline(p) 1077register struct parse *p; 1078{ 1079 register char *oldnext = p->next; 1080 register char *oldend = p->end; 1081 char bracket[4]; 1082 1083 p->next = bracket; 1084 p->end = bracket+3; 1085 bracket[0] = '^'; 1086 bracket[1] = '\n'; 1087 bracket[2] = ']'; 1088 bracket[3] = '\0'; 1089 p_bracket(p); 1090 assert(p->next == bracket+3); 1091 p->next = oldnext; 1092 p->end = oldend; 1093} 1094 1095/* 1096 - repeat - generate code for a bounded repetition, recursively if needed 1097 == static void repeat(register struct parse *p, sopno start, int from, int to); 1098 */ 1099static void 1100repeat(p, start, from, to) 1101register struct parse *p; 1102sopno start; /* operand from here to end of strip */ 1103int from; /* repeated from this number */ 1104int to; /* to this number of times (maybe INFINITY) */ 1105{ 1106 register sopno finish = HERE(); 1107# define N 2 1108# define INF 3 1109# define REP(f, t) ((f)*8 + (t)) 1110# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1111 register sopno copy; 1112 1113 if (p->error != 0) /* head off possible runaway recursion */ 1114 return; 1115 1116 assert(from <= to); 1117 1118 switch (REP(MAP(from), MAP(to))) { 1119 case REP(0, 0): /* must be user doing this */ 1120 DROP(finish-start); /* drop the operand */ 1121 break; 1122 case REP(0, 1): /* as x{1,1}? */ 1123 case REP(0, N): /* as x{1,n}? */ 1124 case REP(0, INF): /* as x{1,}? */ 1125 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1126 INSERT(OCH_, start); /* offset is wrong... */ 1127 repeat(p, start+1, 1, to); 1128 ASTERN(OOR1, start); 1129 AHEAD(start); /* ... fix it */ 1130 EMIT(OOR2, 0); 1131 AHEAD(THERE()); 1132 ASTERN(O_CH, THERETHERE()); 1133 break; 1134 case REP(1, 1): /* trivial case */ 1135 /* done */ 1136 break; 1137 case REP(1, N): /* as x?x{1,n-1} */ 1138 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1139 INSERT(OCH_, start); 1140 ASTERN(OOR1, start); 1141 AHEAD(start); 1142 EMIT(OOR2, 0); /* offset very wrong... */ 1143 AHEAD(THERE()); /* ...so fix it */ 1144 ASTERN(O_CH, THERETHERE()); 1145 copy = dupl(p, start+1, finish+1); 1146 assert(copy == finish+4); 1147 repeat(p, copy, 1, to-1); 1148 break; 1149 case REP(1, INF): /* as x+ */ 1150 INSERT(OPLUS_, start); 1151 ASTERN(O_PLUS, start); 1152 break; 1153 case REP(N, N): /* as xx{m-1,n-1} */ 1154 copy = dupl(p, start, finish); 1155 repeat(p, copy, from-1, to-1); 1156 break; 1157 case REP(N, INF): /* as xx{n-1,INF} */ 1158 copy = dupl(p, start, finish); 1159 repeat(p, copy, from-1, to); 1160 break; 1161 default: /* "can't happen" */ 1162 SETERROR(REG_ASSERT); /* just in case */ 1163 break; 1164 } 1165} 1166 1167/* 1168 - seterr - set an error condition 1169 == static int seterr(register struct parse *p, int e); 1170 */ 1171static int /* useless but makes type checking happy */ 1172seterr(p, e) 1173register struct parse *p; 1174int e; 1175{ 1176 if (p->error == 0) /* keep earliest error condition */ 1177 p->error = e; 1178 p->next = nuls; /* try to bring things to a halt */ 1179 p->end = nuls; 1180 return(0); /* make the return value well-defined */ 1181} 1182 1183/* 1184 - allocset - allocate a set of characters for [] 1185 == static cset *allocset(register struct parse *p); 1186 */ 1187static cset * 1188allocset(p) 1189register struct parse *p; 1190{ 1191 register int no = p->g->ncsets++; 1192 register size_t nc; 1193 register size_t nbytes; 1194 register cset *cs; 1195 register size_t css = (size_t)p->g->csetsize; 1196 register int i; 1197 1198 if (no >= p->ncsalloc) { /* need another column of space */ 1199 p->ncsalloc += CHAR_BIT; 1200 nc = p->ncsalloc; 1201 assert(nc % CHAR_BIT == 0); 1202 nbytes = nc / CHAR_BIT * css; 1203 if (p->g->sets == NULL) 1204 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1205 else 1206 p->g->sets = (cset *)reallocf((char *)p->g->sets, 1207 nc * sizeof(cset)); 1208 if (p->g->setbits == NULL) 1209 p->g->setbits = (uch *)malloc(nbytes); 1210 else { 1211 p->g->setbits = (uch *)reallocf((char *)p->g->setbits, 1212 nbytes); 1213 /* xxx this isn't right if setbits is now NULL */ 1214 for (i = 0; i < no; i++) 1215 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1216 } 1217 if (p->g->sets != NULL && p->g->setbits != NULL) 1218 (void) memset((char *)p->g->setbits + (nbytes - css), 1219 0, css); 1220 else { 1221 no = 0; 1222 SETERROR(REG_ESPACE); 1223 /* caller's responsibility not to do set ops */ 1224 } 1225 } 1226 1227 assert(p->g->sets != NULL); /* xxx */ 1228 cs = &p->g->sets[no]; 1229 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1230 cs->mask = 1 << ((no) % CHAR_BIT); 1231 cs->hash = 0; 1232 cs->smultis = 0; 1233 cs->multis = NULL; 1234 1235 return(cs); 1236} 1237 1238/* 1239 - freeset - free a now-unused set 1240 == static void freeset(register struct parse *p, register cset *cs); 1241 */ 1242static void 1243freeset(p, cs) 1244register struct parse *p; 1245register cset *cs; 1246{ 1247 register int i; 1248 register cset *top = &p->g->sets[p->g->ncsets]; 1249 register size_t css = (size_t)p->g->csetsize; 1250 1251 for (i = 0; i < css; i++) 1252 CHsub(cs, i); 1253 if (cs == top-1) /* recover only the easy case */ 1254 p->g->ncsets--; 1255} 1256 1257/* 1258 - freezeset - final processing on a set of characters 1259 == static int freezeset(register struct parse *p, register cset *cs); 1260 * 1261 * The main task here is merging identical sets. This is usually a waste 1262 * of time (although the hash code minimizes the overhead), but can win 1263 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1264 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1265 * the same value! 1266 */ 1267static int /* set number */ 1268freezeset(p, cs) 1269register struct parse *p; 1270register cset *cs; 1271{ 1272 register short h = cs->hash; 1273 register int i; 1274 register cset *top = &p->g->sets[p->g->ncsets]; 1275 register cset *cs2; 1276 register size_t css = (size_t)p->g->csetsize; 1277 1278 /* look for an earlier one which is the same */ 1279 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1280 if (cs2->hash == h && cs2 != cs) { 1281 /* maybe */ 1282 for (i = 0; i < css; i++) 1283 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1284 break; /* no */ 1285 if (i == css) 1286 break; /* yes */ 1287 } 1288 1289 if (cs2 < top) { /* found one */ 1290 freeset(p, cs); 1291 cs = cs2; 1292 } 1293 1294 return((int)(cs - p->g->sets)); 1295} 1296 1297/* 1298 - firstch - return first character in a set (which must have at least one) 1299 == static int firstch(register struct parse *p, register cset *cs); 1300 */ 1301static int /* character; there is no "none" value */ 1302firstch(p, cs) 1303register struct parse *p; 1304register cset *cs; 1305{ 1306 register int i; 1307 register size_t css = (size_t)p->g->csetsize; 1308 1309 for (i = 0; i < css; i++) 1310 if (CHIN(cs, i)) 1311 return((char)i); 1312 assert(never); 1313 return(0); /* arbitrary */ 1314} 1315 1316/* 1317 - nch - number of characters in a set 1318 == static int nch(register struct parse *p, register cset *cs); 1319 */ 1320static int 1321nch(p, cs) 1322register struct parse *p; 1323register cset *cs; 1324{ 1325 register int i; 1326 register size_t css = (size_t)p->g->csetsize; 1327 register int n = 0; 1328 1329 for (i = 0; i < css; i++) 1330 if (CHIN(cs, i)) 1331 n++; 1332 return(n); 1333} 1334 1335/* 1336 - mcadd - add a collating element to a cset 1337 == static void mcadd(register struct parse *p, register cset *cs, \ 1338 == register char *cp); 1339 */ 1340static void 1341mcadd(p, cs, cp) 1342register struct parse *p; 1343register cset *cs; 1344register char *cp; 1345{ 1346 register size_t oldend = cs->smultis; 1347 1348 cs->smultis += strlen(cp) + 1; 1349 if (cs->multis == NULL) 1350 cs->multis = malloc(cs->smultis); 1351 else 1352 cs->multis = reallocf(cs->multis, cs->smultis); 1353 if (cs->multis == NULL) { 1354 SETERROR(REG_ESPACE); 1355 return; 1356 } 1357 1358 (void) strcpy(cs->multis + oldend - 1, cp); 1359 cs->multis[cs->smultis - 1] = '\0'; 1360} 1361 1362#if used 1363/* 1364 - mcsub - subtract a collating element from a cset 1365 == static void mcsub(register cset *cs, register char *cp); 1366 */ 1367static void 1368mcsub(cs, cp) 1369register cset *cs; 1370register char *cp; 1371{ 1372 register char *fp = mcfind(cs, cp); 1373 register size_t len = strlen(fp); 1374 1375 assert(fp != NULL); 1376 (void) memmove(fp, fp + len + 1, 1377 cs->smultis - (fp + len + 1 - cs->multis)); 1378 cs->smultis -= len; 1379 1380 if (cs->smultis == 0) { 1381 free(cs->multis); 1382 cs->multis = NULL; 1383 return; 1384 } 1385 1386 cs->multis = reallocf(cs->multis, cs->smultis); 1387 assert(cs->multis != NULL); 1388} 1389 1390/* 1391 - mcin - is a collating element in a cset? 1392 == static int mcin(register cset *cs, register char *cp); 1393 */ 1394static int 1395mcin(cs, cp) 1396register cset *cs; 1397register char *cp; 1398{ 1399 return(mcfind(cs, cp) != NULL); 1400} 1401 1402/* 1403 - mcfind - find a collating element in a cset 1404 == static char *mcfind(register cset *cs, register char *cp); 1405 */ 1406static char * 1407mcfind(cs, cp) 1408register cset *cs; 1409register char *cp; 1410{ 1411 register char *p; 1412 1413 if (cs->multis == NULL) 1414 return(NULL); 1415 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1416 if (strcmp(cp, p) == 0) 1417 return(p); 1418 return(NULL); 1419} 1420#endif 1421 1422/* 1423 - mcinvert - invert the list of collating elements in a cset 1424 == static void mcinvert(register struct parse *p, register cset *cs); 1425 * 1426 * This would have to know the set of possibilities. Implementation 1427 * is deferred. 1428 */ 1429static void 1430mcinvert(p, cs) 1431register struct parse *p; 1432register cset *cs; 1433{ 1434 assert(cs->multis == NULL); /* xxx */ 1435} 1436 1437/* 1438 - mccase - add case counterparts of the list of collating elements in a cset 1439 == static void mccase(register struct parse *p, register cset *cs); 1440 * 1441 * This would have to know the set of possibilities. Implementation 1442 * is deferred. 1443 */ 1444static void 1445mccase(p, cs) 1446register struct parse *p; 1447register cset *cs; 1448{ 1449 assert(cs->multis == NULL); /* xxx */ 1450} 1451 1452/* 1453 - isinsets - is this character in any sets? 1454 == static int isinsets(register struct re_guts *g, int c); 1455 */ 1456static int /* predicate */ 1457isinsets(g, c) 1458register struct re_guts *g; 1459int c; 1460{ 1461 register uch *col; 1462 register int i; 1463 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1464 register unsigned uc = (uch)c; 1465 1466 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1467 if (col[uc] != 0) 1468 return(1); 1469 return(0); 1470} 1471 1472/* 1473 - samesets - are these two characters in exactly the same sets? 1474 == static int samesets(register struct re_guts *g, int c1, int c2); 1475 */ 1476static int /* predicate */ 1477samesets(g, c1, c2) 1478register struct re_guts *g; 1479int c1; 1480int c2; 1481{ 1482 register uch *col; 1483 register int i; 1484 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1485 register unsigned uc1 = (uch)c1; 1486 register unsigned uc2 = (uch)c2; 1487 1488 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1489 if (col[uc1] != col[uc2]) 1490 return(0); 1491 return(1); 1492} 1493 1494/* 1495 - categorize - sort out character categories 1496 == static void categorize(struct parse *p, register struct re_guts *g); 1497 */ 1498static void 1499categorize(p, g) 1500struct parse *p; 1501register struct re_guts *g; 1502{ 1503 register cat_t *cats = g->categories; 1504 register int c; 1505 register int c2; 1506 register cat_t cat; 1507 1508 /* avoid making error situations worse */ 1509 if (p->error != 0) 1510 return; 1511 1512 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1513 if (cats[c] == 0 && isinsets(g, c)) { 1514 cat = g->ncategories++; 1515 cats[c] = cat; 1516 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1517 if (cats[c2] == 0 && samesets(g, c, c2)) 1518 cats[c2] = cat; 1519 } 1520} 1521 1522/* 1523 - dupl - emit a duplicate of a bunch of sops 1524 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1525 */ 1526static sopno /* start of duplicate */ 1527dupl(p, start, finish) 1528register struct parse *p; 1529sopno start; /* from here */ 1530sopno finish; /* to this less one */ 1531{ 1532 register sopno ret = HERE(); 1533 register sopno len = finish - start; 1534 1535 assert(finish >= start); 1536 if (len == 0) 1537 return(ret); 1538 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1539 assert(p->ssize >= p->slen + len); 1540 (void) memcpy((char *)(p->strip + p->slen), 1541 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1542 p->slen += len; 1543 return(ret); 1544} 1545 1546/* 1547 - doemit - emit a strip operator 1548 == static void doemit(register struct parse *p, sop op, size_t opnd); 1549 * 1550 * It might seem better to implement this as a macro with a function as 1551 * hard-case backup, but it's just too big and messy unless there are 1552 * some changes to the data structures. Maybe later. 1553 */ 1554static void 1555doemit(p, op, opnd) 1556register struct parse *p; 1557sop op; 1558size_t opnd; 1559{ 1560 /* avoid making error situations worse */ 1561 if (p->error != 0) 1562 return; 1563 1564 /* deal with oversize operands ("can't happen", more or less) */ 1565 assert(opnd < 1<<OPSHIFT); 1566 1567 /* deal with undersized strip */ 1568 if (p->slen >= p->ssize) 1569 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1570 assert(p->slen < p->ssize); 1571 1572 /* finally, it's all reduced to the easy case */ 1573 p->strip[p->slen++] = SOP(op, opnd); 1574} 1575 1576/* 1577 - doinsert - insert a sop into the strip 1578 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1579 */ 1580static void 1581doinsert(p, op, opnd, pos) 1582register struct parse *p; 1583sop op; 1584size_t opnd; 1585sopno pos; 1586{ 1587 register sopno sn; 1588 register sop s; 1589 register int i; 1590 1591 /* avoid making error situations worse */ 1592 if (p->error != 0) 1593 return; 1594 1595 sn = HERE(); 1596 EMIT(op, opnd); /* do checks, ensure space */ 1597 assert(HERE() == sn+1); 1598 s = p->strip[sn]; 1599 1600 /* adjust paren pointers */ 1601 assert(pos > 0); 1602 for (i = 1; i < NPAREN; i++) { 1603 if (p->pbegin[i] >= pos) { 1604 p->pbegin[i]++; 1605 } 1606 if (p->pend[i] >= pos) { 1607 p->pend[i]++; 1608 } 1609 } 1610 1611 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1612 (HERE()-pos-1)*sizeof(sop)); 1613 p->strip[pos] = s; 1614} 1615 1616/* 1617 - dofwd - complete a forward reference 1618 == static void dofwd(register struct parse *p, sopno pos, sop value); 1619 */ 1620static void 1621dofwd(p, pos, value) 1622register struct parse *p; 1623register sopno pos; 1624sop value; 1625{ 1626 /* avoid making error situations worse */ 1627 if (p->error != 0) 1628 return; 1629 1630 assert(value < 1<<OPSHIFT); 1631 p->strip[pos] = OP(p->strip[pos]) | value; 1632} 1633 1634/* 1635 - enlarge - enlarge the strip 1636 == static void enlarge(register struct parse *p, sopno size); 1637 */ 1638static void 1639enlarge(p, size) 1640register struct parse *p; 1641register sopno size; 1642{ 1643 register sop *sp; 1644 1645 if (p->ssize >= size) 1646 return; 1647 1648 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1649 if (sp == NULL) { 1650 SETERROR(REG_ESPACE); 1651 return; 1652 } 1653 p->strip = sp; 1654 p->ssize = size; 1655} 1656 1657/* 1658 - stripsnug - compact the strip 1659 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1660 */ 1661static void 1662stripsnug(p, g) 1663register struct parse *p; 1664register struct re_guts *g; 1665{ 1666 g->nstates = p->slen; 1667 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1668 if (g->strip == NULL) { 1669 SETERROR(REG_ESPACE); 1670 g->strip = p->strip; 1671 } 1672} 1673 1674/* 1675 - findmust - fill in must and mlen with longest mandatory literal string 1676 == static void findmust(register struct parse *p, register struct re_guts *g); 1677 * 1678 * This algorithm could do fancy things like analyzing the operands of | 1679 * for common subsequences. Someday. This code is simple and finds most 1680 * of the interesting cases. 1681 * 1682 * Note that must and mlen got initialized during setup. 1683 */ 1684static void 1685findmust(p, g) 1686struct parse *p; 1687register struct re_guts *g; 1688{ 1689 register sop *scan; 1690 sop *start; 1691 register sop *newstart; 1692 register sopno newlen; 1693 register sop s; 1694 register char *cp; 1695 register sopno i; 1696 1697 /* avoid making error situations worse */ 1698 if (p->error != 0) 1699 return; 1700 1701 /* find the longest OCHAR sequence in strip */ 1702 newlen = 0; 1703 scan = g->strip + 1; 1704 do { 1705 s = *scan++; 1706 switch (OP(s)) { 1707 case OCHAR: /* sequence member */ 1708 if (newlen == 0) /* new sequence */ 1709 newstart = scan - 1; 1710 newlen++; 1711 break; 1712 case OPLUS_: /* things that don't break one */ 1713 case OLPAREN: 1714 case ORPAREN: 1715 break; 1716 case OQUEST_: /* things that must be skipped */ 1717 case OCH_: 1718 scan--; 1719 do { 1720 scan += OPND(s); 1721 s = *scan; 1722 /* assert() interferes w debug printouts */ 1723 if (OP(s) != O_QUEST && OP(s) != O_CH && 1724 OP(s) != OOR2) { 1725 g->iflags |= BAD; 1726 return; 1727 } 1728 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1729 /* fallthrough */ 1730 default: /* things that break a sequence */ 1731 if (newlen > g->mlen) { /* ends one */ 1732 start = newstart; 1733 g->mlen = newlen; 1734 } 1735 newlen = 0; 1736 break; 1737 } 1738 } while (OP(s) != OEND); 1739 1740 if (g->mlen == 0) /* there isn't one */ 1741 return; 1742 1743 /* turn it into a character string */ 1744 g->must = malloc((size_t)g->mlen + 1); 1745 if (g->must == NULL) { /* argh; just forget it */ 1746 g->mlen = 0; 1747 return; 1748 } 1749 cp = g->must; 1750 scan = start; 1751 for (i = g->mlen; i > 0; i--) { 1752 while (OP(s = *scan++) != OCHAR) 1753 continue; 1754 assert(cp < g->must + g->mlen); 1755 *cp++ = (char)OPND(s); 1756 } 1757 assert(cp == g->must + g->mlen); 1758 *cp++ = '\0'; /* just on general principles */ 1759} 1760 1761/*
|