1/* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * @(#)regexp.c 1.3 of 18 April 87 4 * 5 * Copyright (c) 1986 by University of Toronto. 6 * Written by Henry Spencer. Not derived from licensed software. 7 * 8 * Permission is granted to anyone to use this software for any 9 * purpose on any computer system, and to redistribute it freely, 10 * subject to the following restrictions: 11 * 12 * 1. The author is not responsible for the consequences of use of 13 * this software, no matter how awful, even if they arise 14 * from defects in it. 15 * 16 * 2. The origin of this software must not be misrepresented, either 17 * by explicit claim or by omission. 18 * 19 * 3. Altered versions must be plainly marked as such, and must not 20 * be misrepresented as being the original software. 21 * 22 * Beware that some of this code is subtly aware of the way operator 23 * precedence is structured in regular expressions. Serious changes in 24 * regular-expression syntax might require a total rethink. 25 * 26 * This code was modified by Ethan Sommer to work within the kernel 27 * (it now uses kmalloc etc..) 28 * 29 * Modified slightly by Matthew Strait to use more modern C. 30 */ 31 32#include "regexp.h" 33#include "regmagic.h" 34 35/* added by ethan and matt. Lets it work in both kernel and user space. 36(So iptables can use it, for instance.) Yea, it goes both ways... */ 37#if __KERNEL__ 38 #define malloc(foo) kmalloc(foo,GFP_ATOMIC) 39#else 40 #define printk(format,args...) printf(format,##args) 41#endif 42 43void regerror(char * s) 44{ 45 printk("<3>Regexp: %s\n", s); 46 /* NOTREACHED */ 47} 48 49/* 50 * The "internal use only" fields in regexp.h are present to pass info from 51 * compile to execute that permits the execute phase to run lots faster on 52 * simple cases. They are: 53 * 54 * regstart char that must begin a match; '\0' if none obvious 55 * reganch is the match anchored (at beginning-of-line only)? 56 * regmust string (pointer into program) that match must include, or NULL 57 * regmlen length of regmust string 58 * 59 * Regstart and reganch permit very fast decisions on suitable starting points 60 * for a match, cutting down the work a lot. Regmust permits fast rejection 61 * of lines that cannot possibly match. The regmust tests are costly enough 62 * that regcomp() supplies a regmust only if the r.e. contains something 63 * potentially expensive (at present, the only such thing detected is * or + 64 * at the start of the r.e., which can involve a lot of backup). Regmlen is 65 * supplied because the test in regexec() needs it and regcomp() is computing 66 * it anyway. 67 */ 68 69/* 70 * Structure for regexp "program". This is essentially a linear encoding 71 * of a nondeterministic finite-state machine (aka syntax charts or 72 * "railroad normal form" in parsing technology). Each node is an opcode 73 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 74 * all nodes except BRANCH implement concatenation; a "next" pointer with 75 * a BRANCH on both ends of it is connecting two alternatives. (Here we 76 * have one of the subtle syntax dependencies: an individual BRANCH (as 77 * opposed to a collection of them) is never concatenated with anything 78 * because of operator precedence.) The operand of some types of node is 79 * a literal string; for others, it is a node leading into a sub-FSM. In 80 * particular, the operand of a BRANCH node is the first node of the branch. 81 * (NB this is *not* a tree structure: the tail of the branch connects 82 * to the thing following the set of BRANCHes.) The opcodes are: 83 */ 84 85/* definition number opnd? meaning */ 86#define END 0 /* no End of program. */ 87#define BOL 1 /* no Match "" at beginning of line. */ 88#define EOL 2 /* no Match "" at end of line. */ 89#define ANY 3 /* no Match any one character. */ 90#define ANYOF 4 /* str Match any character in this string. */ 91#define ANYBUT 5 /* str Match any character not in this string. */ 92#define BRANCH 6 /* node Match this alternative, or the next... */ 93#define BACK 7 /* no Match "", "next" ptr points backward. */ 94#define EXACTLY 8 /* str Match this string. */ 95#define NOTHING 9 /* no Match empty string. */ 96#define STAR 10 /* node Match this (simple) thing 0 or more times. */ 97#define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 98#define OPEN 20 /* no Mark this point in input as start of #n. */ 99 /* OPEN+1 is number 1, etc. */ 100#define CLOSE 30 /* no Analogous to OPEN. */ 101 102/* 103 * Opcode notes: 104 * 105 * BRANCH The set of branches constituting a single choice are hooked 106 * together with their "next" pointers, since precedence prevents 107 * anything being concatenated to any individual branch. The 108 * "next" pointer of the last BRANCH in a choice points to the 109 * thing following the whole choice. This is also where the 110 * final "next" pointer of each individual branch points; each 111 * branch starts with the operand node of a BRANCH node. 112 * 113 * BACK Normal "next" pointers all implicitly point forward; BACK 114 * exists to make loop structures possible. 115 * 116 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 117 * BRANCH structures using BACK. Simple cases (one character 118 * per match) are implemented with STAR and PLUS for speed 119 * and to minimize recursive plunges. 120 * 121 * OPEN,CLOSE ...are numbered at compile time. 122 */ 123 124/* 125 * A node is one char of opcode followed by two chars of "next" pointer. 126 * "Next" pointers are stored as two 8-bit pieces, high order first. The 127 * value is a positive offset from the opcode of the node containing it. 128 * An operand, if any, simply follows the node. (Note that much of the 129 * code generation knows about this implicit relationship.) 130 * 131 * Using two bytes for the "next" pointer is vast overkill for most things, 132 * but allows patterns to get big without disasters. 133 */ 134#define OP(p) (*(p)) 135#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 136#define OPERAND(p) ((p) + 3) 137 138/* 139 * See regmagic.h for one further detail of program structure. 140 */ 141 142 143/* 144 * Utility definitions. 145 */ 146#ifndef CHARBITS 147#define UCHARAT(p) ((int)*(unsigned char *)(p)) 148#else 149#define UCHARAT(p) ((int)*(p)&CHARBITS) 150#endif 151 152#define FAIL(m) { regerror(m); return(NULL); } 153#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 154#define META "^$.[()|?+*\\" 155 156/* 157 * Flags to be passed up and down. 158 */ 159#define HASWIDTH 01 /* Known never to match null string. */ 160#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 161#define SPSTART 04 /* Starts with * or +. */ 162#define WORST 0 /* Worst case. */ 163 164/* 165 * Global work variables for regcomp(). 166 */ 167static char *regparse; /* Input-scan pointer. */ 168static int regnpar; /* () count. */ 169static char regdummy; 170static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 171static long regsize; /* Code size. */ 172 173/* 174 * Forward declarations for regcomp()'s friends. 175 */ 176#ifndef STATIC 177#define STATIC static 178#endif 179STATIC char *reg(int paren,int *flagp); 180STATIC char *regbranch(int *flagp); 181STATIC char *regpiece(int *flagp); 182STATIC char *regatom(int *flagp); 183STATIC char *regnode(char op); 184STATIC char *regnext(char *p); 185STATIC void regc(char b); 186STATIC void reginsert(char op, char *opnd); 187STATIC void regtail(char *p, char *val); 188STATIC void regoptail(char *p, char *val); 189 190 191__kernel_size_t my_strcspn(const char *s1,const char *s2) 192{ 193 char *scan1; 194 char *scan2; 195 int count; 196 197 count = 0; 198 for (scan1 = (char *)s1; *scan1 != '\0'; scan1++) { 199 for (scan2 = (char *)s2; *scan2 != '\0';) /* ++ moved down. */ 200 if (*scan1 == *scan2++) 201 return(count); 202 count++; 203 } 204 return(count); 205} 206 207/* 208 - regcomp - compile a regular expression into internal code 209 * 210 * We can't allocate space until we know how big the compiled form will be, 211 * but we can't compile it (and thus know how big it is) until we've got a 212 * place to put the code. So we cheat: we compile it twice, once with code 213 * generation turned off and size counting turned on, and once "for real". 214 * This also means that we don't allocate space until we are sure that the 215 * thing really will compile successfully, and we never have to move the 216 * code and thus invalidate pointers into it. (Note that it has to be in 217 * one piece because free() must be able to free it all.) 218 * 219 * Beware that the optimization-preparation code in here knows about some 220 * of the structure of the compiled regexp. 221 */ 222regexp * 223regcomp(char *exp,int *patternsize) 224{ 225 register regexp *r; 226 register char *scan; 227 register char *longest; 228 register int len; 229 int flags; 230 /* commented out by ethan 231 extern char *malloc(); 232 */ 233 234 if (exp == NULL) 235 FAIL("NULL argument"); 236 237 /* First pass: determine size, legality. */ 238 regparse = exp; 239 regnpar = 1; 240 regsize = 0L; 241 regcode = ®dummy; 242 regc(MAGIC); 243 if (reg(0, &flags) == NULL) 244 return(NULL); 245 246 /* Small enough for pointer-storage convention? */ 247 if (regsize >= 32767L) /* Probably could be 65535L. */ 248 FAIL("regexp too big"); 249 250 /* Allocate space. */ 251 *patternsize=sizeof(regexp) + (unsigned)regsize; 252 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 253 if (r == NULL) 254 FAIL("out of space"); 255 256 /* Second pass: emit code. */ 257 regparse = exp; 258 regnpar = 1; 259 regcode = r->program; 260 regc(MAGIC); 261 if (reg(0, &flags) == NULL) 262 return(NULL); 263 264 /* Dig out information for optimizations. */ 265 r->regstart = '\0'; /* Worst-case defaults. */ 266 r->reganch = 0; 267 r->regmust = NULL; 268 r->regmlen = 0; 269 scan = r->program+1; /* First BRANCH. */ 270 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 271 scan = OPERAND(scan); 272 273 /* Starting-point info. */ 274 if (OP(scan) == EXACTLY) 275 r->regstart = *OPERAND(scan); 276 else if (OP(scan) == BOL) 277 r->reganch++; 278 279 /* 280 * If there's something expensive in the r.e., find the 281 * longest literal string that must appear and make it the 282 * regmust. Resolve ties in favor of later strings, since 283 * the regstart check works with the beginning of the r.e. 284 * and avoiding duplication strengthens checking. Not a 285 * strong reason, but sufficient in the absence of others. 286 */ 287 if (flags&SPSTART) { 288 longest = NULL; 289 len = 0; 290 for (; scan != NULL; scan = regnext(scan)) 291 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { 292 longest = OPERAND(scan); 293 len = strlen(OPERAND(scan)); 294 } 295 r->regmust = longest; 296 r->regmlen = len; 297 } 298 } 299 300 return(r); 301} 302 303/* 304 - reg - regular expression, i.e. main body or parenthesized thing 305 * 306 * Caller must absorb opening parenthesis. 307 * 308 * Combining parenthesis handling with the base level of regular expression 309 * is a trifle forced, but the need to tie the tails of the branches to what 310 * follows makes it hard to avoid. 311 */ 312static char * 313reg(int paren, int *flagp /* Parenthesized? */ ) 314{ 315 register char *ret; 316 register char *br; 317 register char *ender; 318 register int parno = 0; /* 0 makes gcc happy */ 319 int flags; 320 321 *flagp = HASWIDTH; /* Tentatively. */ 322 323 /* Make an OPEN node, if parenthesized. */ 324 if (paren) { 325 if (regnpar >= NSUBEXP) 326 FAIL("too many ()"); 327 parno = regnpar; 328 regnpar++; 329 ret = regnode(OPEN+parno); 330 } else 331 ret = NULL; 332 333 /* Pick up the branches, linking them together. */ 334 br = regbranch(&flags); 335 if (br == NULL) 336 return(NULL); 337 if (ret != NULL) 338 regtail(ret, br); /* OPEN -> first. */ 339 else 340 ret = br; 341 if (!(flags&HASWIDTH)) 342 *flagp &= ~HASWIDTH; 343 *flagp |= flags&SPSTART; 344 while (*regparse == '|') { 345 regparse++; 346 br = regbranch(&flags); 347 if (br == NULL) 348 return(NULL); 349 regtail(ret, br); /* BRANCH -> BRANCH. */ 350 if (!(flags&HASWIDTH)) 351 *flagp &= ~HASWIDTH; 352 *flagp |= flags&SPSTART; 353 } 354 355 /* Make a closing node, and hook it on the end. */ 356 ender = regnode((paren) ? CLOSE+parno : END); 357 regtail(ret, ender); 358 359 /* Hook the tails of the branches to the closing node. */ 360 for (br = ret; br != NULL; br = regnext(br)) 361 regoptail(br, ender); 362 363 /* Check for proper termination. */ 364 if (paren && *regparse++ != ')') { 365 FAIL("unmatched ()"); 366 } else if (!paren && *regparse != '\0') { 367 if (*regparse == ')') { 368 FAIL("unmatched ()"); 369 } else 370 FAIL("junk on end"); /* "Can't happen". */ 371 /* NOTREACHED */ 372 } 373 374 return(ret); 375} 376 377/* 378 - regbranch - one alternative of an | operator 379 * 380 * Implements the concatenation operator. 381 */ 382static char * 383regbranch(int *flagp) 384{ 385 register char *ret; 386 register char *chain; 387 register char *latest; 388 int flags; 389 390 *flagp = WORST; /* Tentatively. */ 391 392 ret = regnode(BRANCH); 393 chain = NULL; 394 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { 395 latest = regpiece(&flags); 396 if (latest == NULL) 397 return(NULL); 398 *flagp |= flags&HASWIDTH; 399 if (chain == NULL) /* First piece. */ 400 *flagp |= flags&SPSTART; 401 else 402 regtail(chain, latest); 403 chain = latest; 404 } 405 if (chain == NULL) /* Loop ran zero times. */ 406 (void) regnode(NOTHING); 407 408 return(ret); 409} 410 411/* 412 - regpiece - something followed by possible [*+?] 413 * 414 * Note that the branching code sequences used for ? and the general cases 415 * of * and + are somewhat optimized: they use the same NOTHING node as 416 * both the endmarker for their branch list and the body of the last branch. 417 * It might seem that this node could be dispensed with entirely, but the 418 * endmarker role is not redundant. 419 */ 420static char * 421regpiece(int *flagp) 422{ 423 register char *ret; 424 register char op; 425 register char *next; 426 int flags; 427 428 ret = regatom(&flags); 429 if (ret == NULL) 430 return(NULL); 431 432 op = *regparse; 433 if (!ISMULT(op)) { 434 *flagp = flags; 435 return(ret); 436 } 437 438 if (!(flags&HASWIDTH) && op != '?') 439 FAIL("*+ operand could be empty"); 440 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 441 442 if (op == '*' && (flags&SIMPLE)) 443 reginsert(STAR, ret); 444 else if (op == '*') { 445 /* Emit x* as (x&|), where & means "self". */ 446 reginsert(BRANCH, ret); /* Either x */ 447 regoptail(ret, regnode(BACK)); /* and loop */ 448 regoptail(ret, ret); /* back */ 449 regtail(ret, regnode(BRANCH)); /* or */ 450 regtail(ret, regnode(NOTHING)); /* null. */ 451 } else if (op == '+' && (flags&SIMPLE)) 452 reginsert(PLUS, ret); 453 else if (op == '+') { 454 /* Emit x+ as x(&|), where & means "self". */ 455 next = regnode(BRANCH); /* Either */ 456 regtail(ret, next); 457 regtail(regnode(BACK), ret); /* loop back */ 458 regtail(next, regnode(BRANCH)); /* or */ 459 regtail(ret, regnode(NOTHING)); /* null. */ 460 } else if (op == '?') { 461 /* Emit x? as (x|) */ 462 reginsert(BRANCH, ret); /* Either x */ 463 regtail(ret, regnode(BRANCH)); /* or */ 464 next = regnode(NOTHING); /* null. */ 465 regtail(ret, next); 466 regoptail(ret, next); 467 } 468 regparse++; 469 if (ISMULT(*regparse)) 470 FAIL("nested *?+"); 471 472 return(ret); 473} 474 475/* 476 - regatom - the lowest level 477 * 478 * Optimization: gobbles an entire sequence of ordinary characters so that 479 * it can turn them into a single node, which is smaller to store and 480 * faster to run. Backslashed characters are exceptions, each becoming a 481 * separate node; the code is simpler that way and it's not worth fixing. 482 */ 483static char * 484regatom(int *flagp) 485{ 486 register char *ret; 487 int flags; 488 489 *flagp = WORST; /* Tentatively. */ 490 491 switch (*regparse++) { 492 case '^': 493 ret = regnode(BOL); 494 break; 495 case '$': 496 ret = regnode(EOL); 497 break; 498 case '.': 499 ret = regnode(ANY); 500 *flagp |= HASWIDTH|SIMPLE; 501 break; 502 case '[': { 503 register int class; 504 register int classend; 505 506 if (*regparse == '^') { /* Complement of range. */ 507 ret = regnode(ANYBUT); 508 regparse++; 509 } else 510 ret = regnode(ANYOF); 511 if (*regparse == ']' || *regparse == '-') 512 regc(*regparse++); 513 while (*regparse != '\0' && *regparse != ']') { 514 if (*regparse == '-') { 515 regparse++; 516 if (*regparse == ']' || *regparse == '\0') 517 regc('-'); 518 else { 519 class = UCHARAT(regparse-2)+1; 520 classend = UCHARAT(regparse); 521 if (class > classend+1) 522 FAIL("invalid [] range"); 523 for (; class <= classend; class++) 524 regc(class); 525 regparse++; 526 } 527 } else 528 regc(*regparse++); 529 } 530 regc('\0'); 531 if (*regparse != ']') 532 FAIL("unmatched []"); 533 regparse++; 534 *flagp |= HASWIDTH|SIMPLE; 535 } 536 break; 537 case '(': 538 ret = reg(1, &flags); 539 if (ret == NULL) 540 return(NULL); 541 *flagp |= flags&(HASWIDTH|SPSTART); 542 break; 543 case '\0': 544 case '|': 545 case ')': 546 FAIL("internal urp"); /* Supposed to be caught earlier. */ 547 break; 548 case '?': 549 case '+': 550 case '*': 551 FAIL("?+* follows nothing"); 552 break; 553 case '\\': 554 if (*regparse == '\0') 555 FAIL("trailing \\"); 556 ret = regnode(EXACTLY); 557 regc(*regparse++); 558 regc('\0'); 559 *flagp |= HASWIDTH|SIMPLE; 560 break; 561 default: { 562 register int len; 563 register char ender; 564 565 regparse--; 566 len = my_strcspn((const char *)regparse, (const char *)META); 567 if (len <= 0) 568 FAIL("internal disaster"); 569 ender = *(regparse+len); 570 if (len > 1 && ISMULT(ender)) 571 len--; /* Back off clear of ?+* operand. */ 572 *flagp |= HASWIDTH; 573 if (len == 1) 574 *flagp |= SIMPLE; 575 ret = regnode(EXACTLY); 576 while (len > 0) { 577 regc(*regparse++); 578 len--; 579 } 580 regc('\0'); 581 } 582 break; 583 } 584 585 return(ret); 586} 587 588/* 589 - regnode - emit a node 590 */ 591static char * /* Location. */ 592regnode(char op) 593{ 594 register char *ret; 595 register char *ptr; 596 597 ret = regcode; 598 if (ret == ®dummy) { 599 regsize += 3; 600 return(ret); 601 } 602 603 ptr = ret; 604 *ptr++ = op; 605 *ptr++ = '\0'; /* Null "next" pointer. */ 606 *ptr++ = '\0'; 607 regcode = ptr; 608 609 return(ret); 610} 611 612/* 613 - regc - emit (if appropriate) a byte of code 614 */ 615static void 616regc(char b) 617{ 618 if (regcode != ®dummy) 619 *regcode++ = b; 620 else 621 regsize++; 622} 623 624/* 625 - reginsert - insert an operator in front of already-emitted operand 626 * 627 * Means relocating the operand. 628 */ 629static void 630reginsert(char op, char* opnd) 631{ 632 register char *src; 633 register char *dst; 634 register char *place; 635 636 if (regcode == ®dummy) { 637 regsize += 3; 638 return; 639 } 640 641 src = regcode; 642 regcode += 3; 643 dst = regcode; 644 while (src > opnd) 645 *--dst = *--src; 646 647 place = opnd; /* Op node, where operand used to be. */ 648 *place++ = op; 649 *place++ = '\0'; 650 *place++ = '\0'; 651} 652 653/* 654 - regtail - set the next-pointer at the end of a node chain 655 */ 656static void 657regtail(char *p, char *val) 658{ 659 register char *scan; 660 register char *temp; 661 register int offset; 662 663 if (p == ®dummy) 664 return; 665 666 /* Find last node. */ 667 scan = p; 668 for (;;) { 669 temp = regnext(scan); 670 if (temp == NULL) 671 break; 672 scan = temp; 673 } 674 675 if (OP(scan) == BACK) 676 offset = scan - val; 677 else 678 offset = val - scan; 679 *(scan+1) = (offset>>8)&0377; 680 *(scan+2) = offset&0377; 681} 682 683/* 684 - regoptail - regtail on operand of first argument; nop if operandless 685 */ 686static void 687regoptail(char *p, char *val) 688{ 689 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 690 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 691 return; 692 regtail(OPERAND(p), val); 693} 694 695/* 696 * regexec and friends 697 */ 698 699/* 700 * Global work variables for regexec(). 701 */ 702static char *reginput; /* String-input pointer. */ 703static char *regbol; /* Beginning of input, for ^ check. */ 704static char **regstartp; /* Pointer to startp array. */ 705static char **regendp; /* Ditto for endp. */ 706 707/* 708 * Forwards. 709 */ 710STATIC int regtry(regexp *prog, char *string); 711STATIC int regmatch(char *prog); 712STATIC int regrepeat(char *p); 713 714#ifdef DEBUG 715int regnarrate = 0; 716void regdump(); 717STATIC char *regprop(char *op); 718#endif 719 720/* 721 - regexec - match a regexp against a string 722 */ 723int 724regexec(regexp *prog, char *string) 725{ 726 register char *s; 727 728 /* Be paranoid... */ 729 if (prog == NULL || string == NULL) { 730 printk("<3>Regexp: NULL parameter\n"); 731 return(0); 732 } 733 734 /* Check validity of program. */ 735 if (UCHARAT(prog->program) != MAGIC) { 736 printk("<3>Regexp: corrupted program\n"); 737 return(0); 738 } 739 740 /* If there is a "must appear" string, look for it. */ 741 if (prog->regmust != NULL) { 742 s = string; 743 while ((s = strchr(s, prog->regmust[0])) != NULL) { 744 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 745 break; /* Found it. */ 746 s++; 747 } 748 if (s == NULL) /* Not present. */ 749 return(0); 750 } 751 752 /* Mark beginning of line for ^ . */ 753 regbol = string; 754 755 /* Simplest case: anchored match need be tried only once. */ 756 if (prog->reganch) 757 return(regtry(prog, string)); 758 759 /* Messy cases: unanchored match. */ 760 s = string; 761 if (prog->regstart != '\0') 762 /* We know what char it must start with. */ 763 while ((s = strchr(s, prog->regstart)) != NULL) { 764 if (regtry(prog, s)) 765 return(1); 766 s++; 767 } 768 else 769 /* We don't -- general case. */ 770 do { 771 if (regtry(prog, s)) 772 return(1); 773 } while (*s++ != '\0'); 774 775 /* Failure. */ 776 return(0); 777} 778 779/* 780 - regtry - try match at specific point 781 */ 782static int /* 0 failure, 1 success */ 783regtry(regexp *prog, char *string) 784{ 785 register int i; 786 register char **sp; 787 register char **ep; 788 789 reginput = string; 790 regstartp = prog->startp; 791 regendp = prog->endp; 792 793 sp = prog->startp; 794 ep = prog->endp; 795 for (i = NSUBEXP; i > 0; i--) { 796 *sp++ = NULL; 797 *ep++ = NULL; 798 } 799 if (regmatch(prog->program + 1)) { 800 prog->startp[0] = string; 801 prog->endp[0] = reginput; 802 return(1); 803 } else 804 return(0); 805} 806 807/* 808 - regmatch - main matching routine 809 * 810 * Conceptually the strategy is simple: check to see whether the current 811 * node matches, call self recursively to see whether the rest matches, 812 * and then act accordingly. In practice we make some effort to avoid 813 * recursion, in particular by going through "ordinary" nodes (that don't 814 * need to know whether the rest of the match failed) by a loop instead of 815 * by recursion. 816 */ 817static int /* 0 failure, 1 success */ 818regmatch(char *prog) 819{ 820 register char *scan = prog; /* Current node. */ 821 char *next; /* Next node. */ 822 823#ifdef DEBUG 824 if (scan != NULL && regnarrate) 825 fprintf(stderr, "%s(\n", regprop(scan)); 826#endif 827 while (scan != NULL) { 828#ifdef DEBUG 829 if (regnarrate) 830 fprintf(stderr, "%s...\n", regprop(scan)); 831#endif 832 next = regnext(scan); 833 834 switch (OP(scan)) { 835 case BOL: 836 if (reginput != regbol) 837 return(0); 838 break; 839 case EOL: 840 if (*reginput != '\0') 841 return(0); 842 break; 843 case ANY: 844 if (*reginput == '\0') 845 return(0); 846 reginput++; 847 break; 848 case EXACTLY: { 849 register int len; 850 register char *opnd; 851 852 opnd = OPERAND(scan); 853 /* Inline the first character, for speed. */ 854 if (*opnd != *reginput) 855 return(0); 856 len = strlen(opnd); 857 if (len > 1 && strncmp(opnd, reginput, len) != 0) 858 return(0); 859 reginput += len; 860 } 861 break; 862 case ANYOF: 863 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 864 return(0); 865 reginput++; 866 break; 867 case ANYBUT: 868 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 869 return(0); 870 reginput++; 871 break; 872 case NOTHING: 873 case BACK: 874 break; 875 case OPEN+1: 876 case OPEN+2: 877 case OPEN+3: 878 case OPEN+4: 879 case OPEN+5: 880 case OPEN+6: 881 case OPEN+7: 882 case OPEN+8: 883 case OPEN+9: { 884 register int no; 885 register char *save; 886 887 no = OP(scan) - OPEN; 888 save = reginput; 889 890 if (regmatch(next)) { 891 /* 892 * Don't set startp if some later 893 * invocation of the same parentheses 894 * already has. 895 */ 896 if (regstartp[no] == NULL) 897 regstartp[no] = save; 898 return(1); 899 } else 900 return(0); 901 } 902 break; 903 case CLOSE+1: 904 case CLOSE+2: 905 case CLOSE+3: 906 case CLOSE+4: 907 case CLOSE+5: 908 case CLOSE+6: 909 case CLOSE+7: 910 case CLOSE+8: 911 case CLOSE+9: 912 { 913 register int no; 914 register char *save; 915 916 no = OP(scan) - CLOSE; 917 save = reginput; 918 919 if (regmatch(next)) { 920 /* 921 * Don't set endp if some later 922 * invocation of the same parentheses 923 * already has. 924 */ 925 if (regendp[no] == NULL) 926 regendp[no] = save; 927 return(1); 928 } else 929 return(0); 930 } 931 break; 932 case BRANCH: { 933 register char *save; 934 935 if (OP(next) != BRANCH) /* No choice. */ 936 next = OPERAND(scan); /* Avoid recursion. */ 937 else { 938 do { 939 save = reginput; 940 if (regmatch(OPERAND(scan))) 941 return(1); 942 reginput = save; 943 scan = regnext(scan); 944 } while (scan != NULL && OP(scan) == BRANCH); 945 return(0); 946 /* NOTREACHED */ 947 } 948 } 949 break; 950 case STAR: 951 case PLUS: { 952 register char nextch; 953 register int no; 954 register char *save; 955 register int min; 956 957 /* 958 * Lookahead to avoid useless match attempts 959 * when we know what character comes next. 960 */ 961 nextch = '\0'; 962 if (OP(next) == EXACTLY) 963 nextch = *OPERAND(next); 964 min = (OP(scan) == STAR) ? 0 : 1; 965 save = reginput; 966 no = regrepeat(OPERAND(scan)); 967 while (no >= min) { 968 /* If it could work, try it. */ 969 if (nextch == '\0' || *reginput == nextch) 970 if (regmatch(next)) 971 return(1); 972 /* Couldn't or didn't -- back up. */ 973 no--; 974 reginput = save + no; 975 } 976 return(0); 977 } 978 break; 979 case END: 980 return(1); /* Success! */ 981 break; 982 default: 983 printk("<3>Regexp: memory corruption\n"); 984 return(0); 985 break; 986 } 987 988 scan = next; 989 } 990 991 /* 992 * We get here only if there's trouble -- normally "case END" is 993 * the terminating point. 994 */ 995 printk("<3>Regexp: corrupted pointers\n"); 996 return(0); 997} 998 999/* 1000 - regrepeat - repeatedly match something simple, report how many 1001 */ 1002static int 1003regrepeat(char *p) 1004{ 1005 register int count = 0; 1006 register char *scan; 1007 register char *opnd; 1008 1009 scan = reginput; 1010 opnd = OPERAND(p); 1011 switch (OP(p)) { 1012 case ANY: 1013 count = strlen(scan); 1014 scan += count; 1015 break; 1016 case EXACTLY: 1017 while (*opnd == *scan) { 1018 count++; 1019 scan++; 1020 } 1021 break; 1022 case ANYOF: 1023 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1024 count++; 1025 scan++; 1026 } 1027 break; 1028 case ANYBUT: 1029 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1030 count++; 1031 scan++; 1032 } 1033 break; 1034 default: /* Oh dear. Called inappropriately. */ 1035 printk("<3>Regexp: internal foulup\n"); 1036 count = 0; /* Best compromise. */ 1037 break; 1038 } 1039 reginput = scan; 1040 1041 return(count); 1042} 1043 1044/* 1045 - regnext - dig the "next" pointer out of a node 1046 */ 1047static char* 1048regnext(char *p) 1049{ 1050 register int offset; 1051 1052 if (p == ®dummy) 1053 return(NULL); 1054 1055 offset = NEXT(p); 1056 if (offset == 0) 1057 return(NULL); 1058 1059 if (OP(p) == BACK) 1060 return(p-offset); 1061 else 1062 return(p+offset); 1063} 1064 1065#ifdef DEBUG 1066 1067STATIC char *regprop(); 1068 1069/* 1070 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1071 */ 1072void 1073regdump(regexp *r) 1074{ 1075 register char *s; 1076 register char op = EXACTLY; /* Arbitrary non-END op. */ 1077 register char *next; 1078 /* extern char *strchr(); */ 1079 1080 1081 s = r->program + 1; 1082 while (op != END) { /* While that wasn't END last time... */ 1083 op = OP(s); 1084 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1085 next = regnext(s); 1086 if (next == NULL) /* Next ptr. */ 1087 printf("(0)"); 1088 else 1089 printf("(%d)", (s-r->program)+(next-s)); 1090 s += 3; 1091 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1092 /* Literal string, where present. */ 1093 while (*s != '\0') { 1094 putchar(*s); 1095 s++; 1096 } 1097 s++; 1098 } 1099 putchar('\n'); 1100 } 1101 1102 /* Header fields of interest. */ 1103 if (r->regstart != '\0') 1104 printf("start `%c' ", r->regstart); 1105 if (r->reganch) 1106 printf("anchored "); 1107 if (r->regmust != NULL) 1108 printf("must have \"%s\"", r->regmust); 1109 printf("\n"); 1110} 1111 1112/* 1113 - regprop - printable representation of opcode 1114 */ 1115static char * 1116regprop(char *op) 1117{ 1118#define BUFLEN 50 1119 register char *p; 1120 static char buf[BUFLEN]; 1121 1122 strcpy(buf, ":"); 1123 1124 switch (OP(op)) { 1125 case BOL: 1126 p = "BOL"; 1127 break; 1128 case EOL: 1129 p = "EOL"; 1130 break; 1131 case ANY: 1132 p = "ANY"; 1133 break; 1134 case ANYOF: 1135 p = "ANYOF"; 1136 break; 1137 case ANYBUT: 1138 p = "ANYBUT"; 1139 break; 1140 case BRANCH: 1141 p = "BRANCH"; 1142 break; 1143 case EXACTLY: 1144 p = "EXACTLY"; 1145 break; 1146 case NOTHING: 1147 p = "NOTHING"; 1148 break; 1149 case BACK: 1150 p = "BACK"; 1151 break; 1152 case END: 1153 p = "END"; 1154 break; 1155 case OPEN+1: 1156 case OPEN+2: 1157 case OPEN+3: 1158 case OPEN+4: 1159 case OPEN+5: 1160 case OPEN+6: 1161 case OPEN+7: 1162 case OPEN+8: 1163 case OPEN+9: 1164 snprintf(buf+strlen(buf),BUFLEN-strlen(buf), "OPEN%d", OP(op)-OPEN); 1165 p = NULL; 1166 break; 1167 case CLOSE+1: 1168 case CLOSE+2: 1169 case CLOSE+3: 1170 case CLOSE+4: 1171 case CLOSE+5: 1172 case CLOSE+6: 1173 case CLOSE+7: 1174 case CLOSE+8: 1175 case CLOSE+9: 1176 snprintf(buf+strlen(buf),BUFLEN-strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1177 p = NULL; 1178 break; 1179 case STAR: 1180 p = "STAR"; 1181 break; 1182 case PLUS: 1183 p = "PLUS"; 1184 break; 1185 default: 1186 printk("<3>Regexp: corrupted opcode\n"); 1187 break; 1188 } 1189 if (p != NULL) 1190 strncat(buf, p, BUFLEN-strlen(buf)); 1191 return(buf); 1192} 1193#endif 1194 1195 1196