1/* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22 *** to assist in implementing egrep. 23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25 *** as in BSD grep and ex. 26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald 31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h 32 *** was moved into regexp.h, and the include of regexp.h now uses "'s 33 *** to avoid conflicting with the system regexp.h. Const, bless its 34 *** soul, was removed so it can compile everywhere. The declaration 35 *** of strchr() was in conflict on AIX, so it was removed (as it is 36 *** happily defined in string.h). 37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald 38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes. 39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald 40 *** seiwald@perforce.com, on 05 November 2002, to const string literals. 41 * 42 * Beware that some of this code is subtly aware of the way operator 43 * precedence is structured in regular expressions. Serious changes in 44 * regular-expression syntax might require a total rethink. 45 */ 46#include "regexp.h" 47#include <stdio.h> 48#include <ctype.h> 49#ifndef ultrix 50#include <stdlib.h> 51#endif 52#include <string.h> 53 54/* 55 * The "internal use only" fields in regexp.h are present to pass info from 56 * compile to execute that permits the execute phase to run lots faster on 57 * simple cases. They are: 58 * 59 * regstart char that must begin a match; '\0' if none obvious 60 * reganch is the match anchored (at beginning-of-line only)? 61 * regmust string (pointer into program) that match must include, or NULL 62 * regmlen length of regmust string 63 * 64 * Regstart and reganch permit very fast decisions on suitable starting points 65 * for a match, cutting down the work a lot. Regmust permits fast rejection 66 * of lines that cannot possibly match. The regmust tests are costly enough 67 * that regcomp() supplies a regmust only if the r.e. contains something 68 * potentially expensive (at present, the only such thing detected is * or + 69 * at the start of the r.e., which can involve a lot of backup). Regmlen is 70 * supplied because the test in regexec() needs it and regcomp() is computing 71 * it anyway. 72 */ 73 74/* 75 * Structure for regexp "program". This is essentially a linear encoding 76 * of a nondeterministic finite-state machine (aka syntax charts or 77 * "railroad normal form" in parsing technology). Each node is an opcode 78 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 79 * all nodes except BRANCH implement concatenation; a "next" pointer with 80 * a BRANCH on both ends of it is connecting two alternatives. (Here we 81 * have one of the subtle syntax dependencies: an individual BRANCH (as 82 * opposed to a collection of them) is never concatenated with anything 83 * because of operator precedence.) The operand of some types of node is 84 * a literal string; for others, it is a node leading into a sub-FSM. In 85 * particular, the operand of a BRANCH node is the first node of the branch. 86 * (NB this is *not* a tree structure: the tail of the branch connects 87 * to the thing following the set of BRANCHes.) The opcodes are: 88 */ 89 90/* definition number opnd? meaning */ 91#define END 0 /* no End of program. */ 92#define BOL 1 /* no Match "" at beginning of line. */ 93#define EOL 2 /* no Match "" at end of line. */ 94#define ANY 3 /* no Match any one character. */ 95#define ANYOF 4 /* str Match any character in this string. */ 96#define ANYBUT 5 /* str Match any character not in this string. */ 97#define BRANCH 6 /* node Match this alternative, or the next... */ 98#define BACK 7 /* no Match "", "next" ptr points backward. */ 99#define EXACTLY 8 /* str Match this string. */ 100#define NOTHING 9 /* no Match empty string. */ 101#define STAR 10 /* node Match this (simple) thing 0 or more times. */ 102#define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 103#define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 104#define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 105#define OPEN 20 /* no Mark this point in input as start of #n. */ 106 /* OPEN+1 is number 1, etc. */ 107#define CLOSE 30 /* no Analogous to OPEN. */ 108 109/* 110 * Opcode notes: 111 * 112 * BRANCH The set of branches constituting a single choice are hooked 113 * together with their "next" pointers, since precedence prevents 114 * anything being concatenated to any individual branch. The 115 * "next" pointer of the last BRANCH in a choice points to the 116 * thing following the whole choice. This is also where the 117 * final "next" pointer of each individual branch points; each 118 * branch starts with the operand node of a BRANCH node. 119 * 120 * BACK Normal "next" pointers all implicitly point forward; BACK 121 * exists to make loop structures possible. 122 * 123 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 124 * BRANCH structures using BACK. Simple cases (one character 125 * per match) are implemented with STAR and PLUS for speed 126 * and to minimize recursive plunges. 127 * 128 * OPEN,CLOSE ...are numbered at compile time. 129 */ 130 131/* 132 * A node is one char of opcode followed by two chars of "next" pointer. 133 * "Next" pointers are stored as two 8-bit pieces, high order first. The 134 * value is a positive offset from the opcode of the node containing it. 135 * An operand, if any, simply follows the node. (Note that much of the 136 * code generation knows about this implicit relationship.) 137 * 138 * Using two bytes for the "next" pointer is vast overkill for most things, 139 * but allows patterns to get big without disasters. 140 */ 141#define OP(p) (*(p)) 142#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 143#define OPERAND(p) ((p) + 3) 144 145/* 146 * See regmagic.h for one further detail of program structure. 147 */ 148 149 150/* 151 * Utility definitions. 152 */ 153#ifndef CHARBITS 154#define UCHARAT(p) ((int)*(unsigned char *)(p)) 155#else 156#define UCHARAT(p) ((int)*(p)&CHARBITS) 157#endif 158 159#define FAIL(m) { regerror(m); return(NULL); } 160#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 161 162/* 163 * Flags to be passed up and down. 164 */ 165#define HASWIDTH 01 /* Known never to match null string. */ 166#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 167#define SPSTART 04 /* Starts with * or +. */ 168#define WORST 0 /* Worst case. */ 169 170/* 171 * Global work variables for regcomp(). 172 */ 173static char *regparse; /* Input-scan pointer. */ 174static int regnpar; /* () count. */ 175static char regdummy; 176static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 177static long regsize; /* Code size. */ 178 179/* 180 * Forward declarations for regcomp()'s friends. 181 */ 182#ifndef STATIC 183#define STATIC static 184#endif 185STATIC char *reg( int paren, int *flagp ); 186STATIC char *regbranch( int *flagp ); 187STATIC char *regpiece( int *flagp ); 188STATIC char *regatom( int *flagp ); 189STATIC char *regnode( int op ); 190STATIC char *regnext( register char *p ); 191STATIC void regc( int b ); 192STATIC void reginsert( char op, char *opnd ); 193STATIC void regtail( char *p, char *val ); 194STATIC void regoptail( char *p, char *val ); 195#ifdef STRCSPN 196STATIC int strcspn(); 197#endif 198 199/* 200 - regcomp - compile a regular expression into internal code 201 * 202 * We can't allocate space until we know how big the compiled form will be, 203 * but we can't compile it (and thus know how big it is) until we've got a 204 * place to put the code. So we cheat: we compile it twice, once with code 205 * generation turned off and size counting turned on, and once "for real". 206 * This also means that we don't allocate space until we are sure that the 207 * thing really will compile successfully, and we never have to move the 208 * code and thus invalidate pointers into it. (Note that it has to be in 209 * one piece because free() must be able to free it all.) 210 * 211 * Beware that the optimization-preparation code in here knows about some 212 * of the structure of the compiled regexp. 213 */ 214regexp * 215regcomp( const char *exp ) 216{ 217 register regexp *r; 218 register char *scan; 219 register char *longest; 220 register unsigned len; 221 int flags; 222 223 if (exp == NULL) 224 FAIL("NULL argument"); 225 226 /* First pass: determine size, legality. */ 227#ifdef notdef 228 if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ 229#endif 230 regparse = (char *)exp; 231 regnpar = 1; 232 regsize = 0L; 233 regcode = ®dummy; 234 regc(MAGIC); 235 if (reg(0, &flags) == NULL) 236 return(NULL); 237 238 /* Small enough for pointer-storage convention? */ 239 if (regsize >= 32767L) /* Probably could be 65535L. */ 240 FAIL("regexp too big"); 241 242 /* Allocate space. */ 243 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 244 if (r == NULL) 245 FAIL("out of space"); 246 247 /* Second pass: emit code. */ 248 regparse = (char *)exp; 249 regnpar = 1; 250 regcode = r->program; 251 regc(MAGIC); 252 if (reg(0, &flags) == NULL) 253 return(NULL); 254 255 /* Dig out information for optimizations. */ 256 r->regstart = '\0'; /* Worst-case defaults. */ 257 r->reganch = 0; 258 r->regmust = NULL; 259 r->regmlen = 0; 260 scan = r->program+1; /* First BRANCH. */ 261 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 262 scan = OPERAND(scan); 263 264 /* Starting-point info. */ 265 if (OP(scan) == EXACTLY) 266 r->regstart = *OPERAND(scan); 267 else if (OP(scan) == BOL) 268 r->reganch++; 269 270 /* 271 * If there's something expensive in the r.e., find the 272 * longest literal string that must appear and make it the 273 * regmust. Resolve ties in favor of later strings, since 274 * the regstart check works with the beginning of the r.e. 275 * and avoiding duplication strengthens checking. Not a 276 * strong reason, but sufficient in the absence of others. 277 */ 278 if (flags&SPSTART) { 279 longest = NULL; 280 len = 0; 281 for (; scan != NULL; scan = regnext(scan)) 282 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 283 longest = OPERAND(scan); 284 len = strlen(OPERAND(scan)); 285 } 286 r->regmust = longest; 287 r->regmlen = len; 288 } 289 } 290 291 return(r); 292} 293 294/* 295 - reg - regular expression, i.e. main body or parenthesized thing 296 * 297 * Caller must absorb opening parenthesis. 298 * 299 * Combining parenthesis handling with the base level of regular expression 300 * is a trifle forced, but the need to tie the tails of the branches to what 301 * follows makes it hard to avoid. 302 */ 303static char * 304reg( 305 int paren, /* Parenthesized? */ 306 int *flagp ) 307{ 308 register char *ret; 309 register char *br; 310 register char *ender; 311 register int parno; 312 int flags; 313 314 *flagp = HASWIDTH; /* Tentatively. */ 315 316 /* Make an OPEN node, if parenthesized. */ 317 if (paren) { 318 if (regnpar >= NSUBEXP) 319 FAIL("too many ()"); 320 parno = regnpar; 321 regnpar++; 322 ret = regnode(OPEN+parno); 323 } else 324 ret = NULL; 325 326 /* Pick up the branches, linking them together. */ 327 br = regbranch(&flags); 328 if (br == NULL) 329 return(NULL); 330 if (ret != NULL) 331 regtail(ret, br); /* OPEN -> first. */ 332 else 333 ret = br; 334 if (!(flags&HASWIDTH)) 335 *flagp &= ~HASWIDTH; 336 *flagp |= flags&SPSTART; 337 while (*regparse == '|' || *regparse == '\n') { 338 regparse++; 339 br = regbranch(&flags); 340 if (br == NULL) 341 return(NULL); 342 regtail(ret, br); /* BRANCH -> BRANCH. */ 343 if (!(flags&HASWIDTH)) 344 *flagp &= ~HASWIDTH; 345 *flagp |= flags&SPSTART; 346 } 347 348 /* Make a closing node, and hook it on the end. */ 349 ender = regnode((paren) ? CLOSE+parno : END); 350 regtail(ret, ender); 351 352 /* Hook the tails of the branches to the closing node. */ 353 for (br = ret; br != NULL; br = regnext(br)) 354 regoptail(br, ender); 355 356 /* Check for proper termination. */ 357 if (paren && *regparse++ != ')') { 358 FAIL("unmatched ()"); 359 } else if (!paren && *regparse != '\0') { 360 if (*regparse == ')') { 361 FAIL("unmatched ()"); 362 } else 363 FAIL("junk on end"); /* "Can't happen". */ 364 /* NOTREACHED */ 365 } 366 367 return(ret); 368} 369 370/* 371 - regbranch - one alternative of an | operator 372 * 373 * Implements the concatenation operator. 374 */ 375static char * 376regbranch( int *flagp ) 377{ 378 register char *ret; 379 register char *chain; 380 register char *latest; 381 int flags; 382 383 *flagp = WORST; /* Tentatively. */ 384 385 ret = regnode(BRANCH); 386 chain = NULL; 387 while (*regparse != '\0' && *regparse != ')' && 388 *regparse != '\n' && *regparse != '|') { 389 latest = regpiece(&flags); 390 if (latest == NULL) 391 return(NULL); 392 *flagp |= flags&HASWIDTH; 393 if (chain == NULL) /* First piece. */ 394 *flagp |= flags&SPSTART; 395 else 396 regtail(chain, latest); 397 chain = latest; 398 } 399 if (chain == NULL) /* Loop ran zero times. */ 400 (void) regnode(NOTHING); 401 402 return(ret); 403} 404 405/* 406 - regpiece - something followed by possible [*+?] 407 * 408 * Note that the branching code sequences used for ? and the general cases 409 * of * and + are somewhat optimized: they use the same NOTHING node as 410 * both the endmarker for their branch list and the body of the last branch. 411 * It might seem that this node could be dispensed with entirely, but the 412 * endmarker role is not redundant. 413 */ 414static char * 415regpiece( int *flagp ) 416{ 417 register char *ret; 418 register char op; 419 register char *next; 420 int flags; 421 422 ret = regatom(&flags); 423 if (ret == NULL) 424 return(NULL); 425 426 op = *regparse; 427 if (!ISMULT(op)) { 428 *flagp = flags; 429 return(ret); 430 } 431 432 if (!(flags&HASWIDTH) && op != '?') 433 FAIL("*+ operand could be empty"); 434 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 435 436 if (op == '*' && (flags&SIMPLE)) 437 reginsert(STAR, ret); 438 else if (op == '*') { 439 /* Emit x* as (x&|), where & means "self". */ 440 reginsert(BRANCH, ret); /* Either x */ 441 regoptail(ret, regnode(BACK)); /* and loop */ 442 regoptail(ret, ret); /* back */ 443 regtail(ret, regnode(BRANCH)); /* or */ 444 regtail(ret, regnode(NOTHING)); /* null. */ 445 } else if (op == '+' && (flags&SIMPLE)) 446 reginsert(PLUS, ret); 447 else if (op == '+') { 448 /* Emit x+ as x(&|), where & means "self". */ 449 next = regnode(BRANCH); /* Either */ 450 regtail(ret, next); 451 regtail(regnode(BACK), ret); /* loop back */ 452 regtail(next, regnode(BRANCH)); /* or */ 453 regtail(ret, regnode(NOTHING)); /* null. */ 454 } else if (op == '?') { 455 /* Emit x? as (x|) */ 456 reginsert(BRANCH, ret); /* Either x */ 457 regtail(ret, regnode(BRANCH)); /* or */ 458 next = regnode(NOTHING); /* null. */ 459 regtail(ret, next); 460 regoptail(ret, next); 461 } 462 regparse++; 463 if (ISMULT(*regparse)) 464 FAIL("nested *?+"); 465 466 return(ret); 467} 468 469/* 470 - regatom - the lowest level 471 * 472 * Optimization: gobbles an entire sequence of ordinary characters so that 473 * it can turn them into a single node, which is smaller to store and 474 * faster to run. Backslashed characters are exceptions, each becoming a 475 * separate node; the code is simpler that way and it's not worth fixing. 476 */ 477static char * 478regatom( int *flagp ) 479{ 480 register char *ret; 481 int flags; 482 483 *flagp = WORST; /* Tentatively. */ 484 485 switch (*regparse++) { 486 /* FIXME: these chars only have meaning at beg/end of pat? */ 487 case '^': 488 ret = regnode(BOL); 489 break; 490 case '$': 491 ret = regnode(EOL); 492 break; 493 case '.': 494 ret = regnode(ANY); 495 *flagp |= HASWIDTH|SIMPLE; 496 break; 497 case '[': { 498 register int classr; 499 register int classend; 500 501 if (*regparse == '^') { /* Complement of range. */ 502 ret = regnode(ANYBUT); 503 regparse++; 504 } else 505 ret = regnode(ANYOF); 506 if (*regparse == ']' || *regparse == '-') 507 regc(*regparse++); 508 while (*regparse != '\0' && *regparse != ']') { 509 if (*regparse == '-') { 510 regparse++; 511 if (*regparse == ']' || *regparse == '\0') 512 regc('-'); 513 else { 514 classr = UCHARAT(regparse-2)+1; 515 classend = UCHARAT(regparse); 516 if (classr > classend+1) 517 FAIL("invalid [] range"); 518 for (; classr <= classend; classr++) 519 regc(classr); 520 regparse++; 521 } 522 } else 523 regc(*regparse++); 524 } 525 regc('\0'); 526 if (*regparse != ']') 527 FAIL("unmatched []"); 528 regparse++; 529 *flagp |= HASWIDTH|SIMPLE; 530 } 531 break; 532 case '(': 533 ret = reg(1, &flags); 534 if (ret == NULL) 535 return(NULL); 536 *flagp |= flags&(HASWIDTH|SPSTART); 537 break; 538 case '\0': 539 case '|': 540 case '\n': 541 case ')': 542 FAIL("internal urp"); /* Supposed to be caught earlier. */ 543 break; 544 case '?': 545 case '+': 546 case '*': 547 FAIL("?+* follows nothing"); 548 break; 549 case '\\': 550 switch (*regparse++) { 551 case '\0': 552 FAIL("trailing \\"); 553 break; 554 case '<': 555 ret = regnode(WORDA); 556 break; 557 case '>': 558 ret = regnode(WORDZ); 559 break; 560 /* FIXME: Someday handle \1, \2, ... */ 561 default: 562 /* Handle general quoted chars in exact-match routine */ 563 goto de_fault; 564 } 565 break; 566 de_fault: 567 default: 568 /* 569 * Encode a string of characters to be matched exactly. 570 * 571 * This is a bit tricky due to quoted chars and due to 572 * '*', '+', and '?' taking the SINGLE char previous 573 * as their operand. 574 * 575 * On entry, the char at regparse[-1] is going to go 576 * into the string, no matter what it is. (It could be 577 * following a \ if we are entered from the '\' case.) 578 * 579 * Basic idea is to pick up a good char in ch and 580 * examine the next char. If it's *+? then we twiddle. 581 * If it's \ then we frozzle. If it's other magic char 582 * we push ch and terminate the string. If none of the 583 * above, we push ch on the string and go around again. 584 * 585 * regprev is used to remember where "the current char" 586 * starts in the string, if due to a *+? we need to back 587 * up and put the current char in a separate, 1-char, string. 588 * When regprev is NULL, ch is the only char in the 589 * string; this is used in *+? handling, and in setting 590 * flags |= SIMPLE at the end. 591 */ 592 { 593 char *regprev; 594 register char ch; 595 596 regparse--; /* Look at cur char */ 597 ret = regnode(EXACTLY); 598 for ( regprev = 0 ; ; ) { 599 ch = *regparse++; /* Get current char */ 600 switch (*regparse) { /* look at next one */ 601 602 default: 603 regc(ch); /* Add cur to string */ 604 break; 605 606 case '.': case '[': case '(': 607 case ')': case '|': case '\n': 608 case '$': case '^': 609 case '\0': 610 /* FIXME, $ and ^ should not always be magic */ 611 magic: 612 regc(ch); /* dump cur char */ 613 goto done; /* and we are done */ 614 615 case '?': case '+': case '*': 616 if (!regprev) /* If just ch in str, */ 617 goto magic; /* use it */ 618 /* End mult-char string one early */ 619 regparse = regprev; /* Back up parse */ 620 goto done; 621 622 case '\\': 623 regc(ch); /* Cur char OK */ 624 switch (regparse[1]){ /* Look after \ */ 625 case '\0': 626 case '<': 627 case '>': 628 /* FIXME: Someday handle \1, \2, ... */ 629 goto done; /* Not quoted */ 630 default: 631 /* Backup point is \, scan * point is after it. */ 632 regprev = regparse; 633 regparse++; 634 continue; /* NOT break; */ 635 } 636 } 637 regprev = regparse; /* Set backup point */ 638 } 639 done: 640 regc('\0'); 641 *flagp |= HASWIDTH; 642 if (!regprev) /* One char? */ 643 *flagp |= SIMPLE; 644 } 645 break; 646 } 647 648 return(ret); 649} 650 651/* 652 - regnode - emit a node 653 */ 654static char * /* Location. */ 655regnode( int op ) 656{ 657 register char *ret; 658 register char *ptr; 659 660 ret = regcode; 661 if (ret == ®dummy) { 662 regsize += 3; 663 return(ret); 664 } 665 666 ptr = ret; 667 *ptr++ = op; 668 *ptr++ = '\0'; /* Null "next" pointer. */ 669 *ptr++ = '\0'; 670 regcode = ptr; 671 672 return(ret); 673} 674 675/* 676 - regc - emit (if appropriate) a byte of code 677 */ 678static void 679regc( int b ) 680{ 681 if (regcode != ®dummy) 682 *regcode++ = b; 683 else 684 regsize++; 685} 686 687/* 688 - reginsert - insert an operator in front of already-emitted operand 689 * 690 * Means relocating the operand. 691 */ 692static void 693reginsert( 694 char op, 695 char *opnd ) 696{ 697 register char *src; 698 register char *dst; 699 register char *place; 700 701 if (regcode == ®dummy) { 702 regsize += 3; 703 return; 704 } 705 706 src = regcode; 707 regcode += 3; 708 dst = regcode; 709 while (src > opnd) 710 *--dst = *--src; 711 712 place = opnd; /* Op node, where operand used to be. */ 713 *place++ = op; 714 *place++ = '\0'; 715 *place++ = '\0'; 716} 717 718/* 719 - regtail - set the next-pointer at the end of a node chain 720 */ 721static void 722regtail( 723 char *p, 724 char *val ) 725{ 726 register char *scan; 727 register char *temp; 728 register int offset; 729 730 if (p == ®dummy) 731 return; 732 733 /* Find last node. */ 734 scan = p; 735 for (;;) { 736 temp = regnext(scan); 737 if (temp == NULL) 738 break; 739 scan = temp; 740 } 741 742 if (OP(scan) == BACK) 743 offset = scan - val; 744 else 745 offset = val - scan; 746 *(scan+1) = (offset>>8)&0377; 747 *(scan+2) = offset&0377; 748} 749 750/* 751 - regoptail - regtail on operand of first argument; nop if operandless 752 */ 753 754static void 755regoptail( 756 char *p, 757 char *val ) 758{ 759 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 760 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 761 return; 762 regtail(OPERAND(p), val); 763} 764 765/* 766 * regexec and friends 767 */ 768 769/* 770 * Global work variables for regexec(). 771 */ 772static const char *reginput; /* String-input pointer. */ 773static char *regbol; /* Beginning of input, for ^ check. */ 774static const char **regstartp; /* Pointer to startp array. */ 775static const char **regendp; /* Ditto for endp. */ 776 777/* 778 * Forwards. 779 */ 780STATIC int regtry( regexp *prog, const char *string ); 781STATIC int regmatch( char *prog ); 782STATIC int regrepeat( char *p ); 783 784#ifdef DEBUG 785int regnarrate = 0; 786void regdump(); 787STATIC char *regprop(); 788#endif 789 790/* 791 - regexec - match a regexp against a string 792 */ 793int 794regexec( 795 register regexp *prog, 796 register const char *string ) 797{ 798 register char *s; 799 800 /* Be paranoid... */ 801 if (prog == NULL || string == NULL) { 802 regerror("NULL parameter"); 803 return(0); 804 } 805 806 /* Check validity of program. */ 807 if (UCHARAT(prog->program) != MAGIC) { 808 regerror("corrupted program"); 809 return(0); 810 } 811 812 /* If there is a "must appear" string, look for it. */ 813 if (prog->regmust != NULL) { 814 s = (char *)string; 815 while ((s = strchr(s, prog->regmust[0])) != NULL) { 816 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 817 break; /* Found it. */ 818 s++; 819 } 820 if (s == NULL) /* Not present. */ 821 return(0); 822 } 823 824 /* Mark beginning of line for ^ . */ 825 regbol = (char *)string; 826 827 /* Simplest case: anchored match need be tried only once. */ 828 if (prog->reganch) 829 return(regtry(prog, string)); 830 831 /* Messy cases: unanchored match. */ 832 s = (char *)string; 833 if (prog->regstart != '\0') 834 /* We know what char it must start with. */ 835 while ((s = strchr(s, prog->regstart)) != NULL) { 836 if (regtry(prog, s)) 837 return(1); 838 s++; 839 } 840 else 841 /* We don't -- general case. */ 842 do { 843 if (regtry(prog, s)) 844 return(1); 845 } while (*s++ != '\0'); 846 847 /* Failure. */ 848 return(0); 849} 850 851/* 852 - regtry - try match at specific point 853 */ 854static int /* 0 failure, 1 success */ 855regtry( 856 regexp *prog, 857 const char *string ) 858{ 859 register int i; 860 register const char **sp; 861 register const char **ep; 862 863 reginput = string; 864 regstartp = prog->startp; 865 regendp = prog->endp; 866 867 sp = prog->startp; 868 ep = prog->endp; 869 for (i = NSUBEXP; i > 0; i--) { 870 *sp++ = NULL; 871 *ep++ = NULL; 872 } 873 if (regmatch(prog->program + 1)) { 874 prog->startp[0] = string; 875 prog->endp[0] = reginput; 876 return(1); 877 } else 878 return(0); 879} 880 881/* 882 - regmatch - main matching routine 883 * 884 * Conceptually the strategy is simple: check to see whether the current 885 * node matches, call self recursively to see whether the rest matches, 886 * and then act accordingly. In practice we make some effort to avoid 887 * recursion, in particular by going through "ordinary" nodes (that don't 888 * need to know whether the rest of the match failed) by a loop instead of 889 * by recursion. 890 */ 891static int /* 0 failure, 1 success */ 892regmatch( char *prog ) 893{ 894 register char *scan; /* Current node. */ 895 char *next; /* Next node. */ 896 897 scan = prog; 898#ifdef DEBUG 899 if (scan != NULL && regnarrate) 900 fprintf(stderr, "%s(\n", regprop(scan)); 901#endif 902 while (scan != NULL) { 903#ifdef DEBUG 904 if (regnarrate) 905 fprintf(stderr, "%s...\n", regprop(scan)); 906#endif 907 next = regnext(scan); 908 909 switch (OP(scan)) { 910 case BOL: 911 if (reginput != regbol) 912 return(0); 913 break; 914 case EOL: 915 if (*reginput != '\0') 916 return(0); 917 break; 918 case WORDA: 919 /* Must be looking at a letter, digit, or _ */ 920 if ((!isalnum(*reginput)) && *reginput != '_') 921 return(0); 922 /* Prev must be BOL or nonword */ 923 if (reginput > regbol && 924 (isalnum(reginput[-1]) || reginput[-1] == '_')) 925 return(0); 926 break; 927 case WORDZ: 928 /* Must be looking at non letter, digit, or _ */ 929 if (isalnum(*reginput) || *reginput == '_') 930 return(0); 931 /* We don't care what the previous char was */ 932 break; 933 case ANY: 934 if (*reginput == '\0') 935 return(0); 936 reginput++; 937 break; 938 case EXACTLY: { 939 register int len; 940 register char *opnd; 941 942 opnd = OPERAND(scan); 943 /* Inline the first character, for speed. */ 944 if (*opnd != *reginput) 945 return(0); 946 len = strlen(opnd); 947 if (len > 1 && strncmp(opnd, reginput, len) != 0) 948 return(0); 949 reginput += len; 950 } 951 break; 952 case ANYOF: 953 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 954 return(0); 955 reginput++; 956 break; 957 case ANYBUT: 958 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 959 return(0); 960 reginput++; 961 break; 962 case NOTHING: 963 break; 964 case BACK: 965 break; 966 case OPEN+1: 967 case OPEN+2: 968 case OPEN+3: 969 case OPEN+4: 970 case OPEN+5: 971 case OPEN+6: 972 case OPEN+7: 973 case OPEN+8: 974 case OPEN+9: { 975 register int no; 976 register const char *save; 977 978 no = OP(scan) - OPEN; 979 save = reginput; 980 981 if (regmatch(next)) { 982 /* 983 * Don't set startp if some later 984 * invocation of the same parentheses 985 * already has. 986 */ 987 if (regstartp[no] == NULL) 988 regstartp[no] = save; 989 return(1); 990 } else 991 return(0); 992 } 993 break; 994 case CLOSE+1: 995 case CLOSE+2: 996 case CLOSE+3: 997 case CLOSE+4: 998 case CLOSE+5: 999 case CLOSE+6: 1000 case CLOSE+7: 1001 case CLOSE+8: 1002 case CLOSE+9: { 1003 register int no; 1004 register const char *save; 1005 1006 no = OP(scan) - CLOSE; 1007 save = reginput; 1008 1009 if (regmatch(next)) { 1010 /* 1011 * Don't set endp if some later 1012 * invocation of the same parentheses 1013 * already has. 1014 */ 1015 if (regendp[no] == NULL) 1016 regendp[no] = save; 1017 return(1); 1018 } else 1019 return(0); 1020 } 1021 break; 1022 case BRANCH: { 1023 register const char *save; 1024 1025 if (OP(next) != BRANCH) /* No choice. */ 1026 next = OPERAND(scan); /* Avoid recursion. */ 1027 else { 1028 do { 1029 save = reginput; 1030 if (regmatch(OPERAND(scan))) 1031 return(1); 1032 reginput = save; 1033 scan = regnext(scan); 1034 } while (scan != NULL && OP(scan) == BRANCH); 1035 return(0); 1036 /* NOTREACHED */ 1037 } 1038 } 1039 break; 1040 case STAR: 1041 case PLUS: { 1042 register char nextch; 1043 register int no; 1044 register const char *save; 1045 register int min; 1046 1047 /* 1048 * Lookahead to avoid useless match attempts 1049 * when we know what character comes next. 1050 */ 1051 nextch = '\0'; 1052 if (OP(next) == EXACTLY) 1053 nextch = *OPERAND(next); 1054 min = (OP(scan) == STAR) ? 0 : 1; 1055 save = reginput; 1056 no = regrepeat(OPERAND(scan)); 1057 while (no >= min) { 1058 /* If it could work, try it. */ 1059 if (nextch == '\0' || *reginput == nextch) 1060 if (regmatch(next)) 1061 return(1); 1062 /* Couldn't or didn't -- back up. */ 1063 no--; 1064 reginput = save + no; 1065 } 1066 return(0); 1067 } 1068 break; 1069 case END: 1070 return(1); /* Success! */ 1071 break; 1072 default: 1073 regerror("memory corruption"); 1074 return(0); 1075 break; 1076 } 1077 1078 scan = next; 1079 } 1080 1081 /* 1082 * We get here only if there's trouble -- normally "case END" is 1083 * the terminating point. 1084 */ 1085 regerror("corrupted pointers"); 1086 return(0); 1087} 1088 1089/* 1090 - regrepeat - repeatedly match something simple, report how many 1091 */ 1092static int 1093regrepeat( char *p ) 1094{ 1095 register int count = 0; 1096 register const char *scan; 1097 register char *opnd; 1098 1099 scan = reginput; 1100 opnd = OPERAND(p); 1101 switch (OP(p)) { 1102 case ANY: 1103 count = strlen(scan); 1104 scan += count; 1105 break; 1106 case EXACTLY: 1107 while (*opnd == *scan) { 1108 count++; 1109 scan++; 1110 } 1111 break; 1112 case ANYOF: 1113 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1114 count++; 1115 scan++; 1116 } 1117 break; 1118 case ANYBUT: 1119 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1120 count++; 1121 scan++; 1122 } 1123 break; 1124 default: /* Oh dear. Called inappropriately. */ 1125 regerror("internal foulup"); 1126 count = 0; /* Best compromise. */ 1127 break; 1128 } 1129 reginput = scan; 1130 1131 return(count); 1132} 1133 1134/* 1135 - regnext - dig the "next" pointer out of a node 1136 */ 1137static char * 1138regnext( register char *p ) 1139{ 1140 register int offset; 1141 1142 if (p == ®dummy) 1143 return(NULL); 1144 1145 offset = NEXT(p); 1146 if (offset == 0) 1147 return(NULL); 1148 1149 if (OP(p) == BACK) 1150 return(p-offset); 1151 else 1152 return(p+offset); 1153} 1154 1155#ifdef DEBUG 1156 1157STATIC char *regprop(); 1158 1159/* 1160 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1161 */ 1162void 1163regdump( regexp *r ) 1164{ 1165 register char *s; 1166 register char op = EXACTLY; /* Arbitrary non-END op. */ 1167 register char *next; 1168 1169 1170 s = r->program + 1; 1171 while (op != END) { /* While that wasn't END last time... */ 1172 op = OP(s); 1173 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1174 next = regnext(s); 1175 if (next == NULL) /* Next ptr. */ 1176 printf("(0)"); 1177 else 1178 printf("(%d)", (s-r->program)+(next-s)); 1179 s += 3; 1180 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1181 /* Literal string, where present. */ 1182 while (*s != '\0') { 1183 putchar(*s); 1184 s++; 1185 } 1186 s++; 1187 } 1188 putchar('\n'); 1189 } 1190 1191 /* Header fields of interest. */ 1192 if (r->regstart != '\0') 1193 printf("start `%c' ", r->regstart); 1194 if (r->reganch) 1195 printf("anchored "); 1196 if (r->regmust != NULL) 1197 printf("must have \"%s\"", r->regmust); 1198 printf("\n"); 1199} 1200 1201/* 1202 - regprop - printable representation of opcode 1203 */ 1204static char * 1205regprop( char *op ) 1206{ 1207 register char *p; 1208 static char buf[50]; 1209 1210 (void) strcpy(buf, ":"); 1211 1212 switch (OP(op)) { 1213 case BOL: 1214 p = "BOL"; 1215 break; 1216 case EOL: 1217 p = "EOL"; 1218 break; 1219 case ANY: 1220 p = "ANY"; 1221 break; 1222 case ANYOF: 1223 p = "ANYOF"; 1224 break; 1225 case ANYBUT: 1226 p = "ANYBUT"; 1227 break; 1228 case BRANCH: 1229 p = "BRANCH"; 1230 break; 1231 case EXACTLY: 1232 p = "EXACTLY"; 1233 break; 1234 case NOTHING: 1235 p = "NOTHING"; 1236 break; 1237 case BACK: 1238 p = "BACK"; 1239 break; 1240 case END: 1241 p = "END"; 1242 break; 1243 case OPEN+1: 1244 case OPEN+2: 1245 case OPEN+3: 1246 case OPEN+4: 1247 case OPEN+5: 1248 case OPEN+6: 1249 case OPEN+7: 1250 case OPEN+8: 1251 case OPEN+9: 1252 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1253 p = NULL; 1254 break; 1255 case CLOSE+1: 1256 case CLOSE+2: 1257 case CLOSE+3: 1258 case CLOSE+4: 1259 case CLOSE+5: 1260 case CLOSE+6: 1261 case CLOSE+7: 1262 case CLOSE+8: 1263 case CLOSE+9: 1264 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1265 p = NULL; 1266 break; 1267 case STAR: 1268 p = "STAR"; 1269 break; 1270 case PLUS: 1271 p = "PLUS"; 1272 break; 1273 case WORDA: 1274 p = "WORDA"; 1275 break; 1276 case WORDZ: 1277 p = "WORDZ"; 1278 break; 1279 default: 1280 regerror("corrupted opcode"); 1281 break; 1282 } 1283 if (p != NULL) 1284 (void) strcat(buf, p); 1285 return(buf); 1286} 1287#endif 1288 1289/* 1290 * The following is provided for those people who do not have strcspn() in 1291 * their C libraries. They should get off their butts and do something 1292 * about it; at least one public-domain implementation of those (highly 1293 * useful) string routines has been published on Usenet. 1294 */ 1295#ifdef STRCSPN 1296/* 1297 * strcspn - find length of initial segment of s1 consisting entirely 1298 * of characters not from s2 1299 */ 1300 1301static int 1302strcspn( 1303 char *s1, 1304 char *s2 ) 1305{ 1306 register char *scan1; 1307 register char *scan2; 1308 register int count; 1309 1310 count = 0; 1311 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1312 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1313 if (*scan1 == *scan2++) 1314 return(count); 1315 count++; 1316 } 1317 return(count); 1318} 1319#endif 1320 1321void 1322regerror( const char *s ) 1323{ 1324 printf( "re error %s\n", s ); 1325} 1326