1/* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22 *** to assist in implementing egrep. 23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25 *** as in BSD grep and ex. 26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30 * 31 * Beware that some of this code is subtly aware of the way operator 32 * precedence is structured in regular expressions. Serious changes in 33 * regular-expression syntax might require a total rethink. 34 */ 35 36#include <sys/cdefs.h> 37#if defined(LIBC_SCCS) && !defined(lint) 38__RCSID("$NetBSD: regexp.c,v 1.19 2016/01/26 16:05:18 christos Exp $"); 39#endif /* LIBC_SCCS and not lint */ 40 41#include <ctype.h> 42#include <regexp.h> 43#include <stdio.h> 44#include <stdlib.h> 45#include <string.h> 46#include "regmagic.h" 47 48/* 49 * The "internal use only" fields in regexp.h are present to pass info from 50 * compile to execute that permits the execute phase to run lots faster on 51 * simple cases. They are: 52 * 53 * regstart char that must begin a match; '\0' if none obvious 54 * reganch is the match anchored (at beginning-of-line only)? 55 * regmust string (pointer into program) that match must include, or NULL 56 * regmlen length of regmust string 57 * 58 * Regstart and reganch permit very fast decisions on suitable starting points 59 * for a match, cutting down the work a lot. Regmust permits fast rejection 60 * of lines that cannot possibly match. The regmust tests are costly enough 61 * that regcomp() supplies a regmust only if the r.e. contains something 62 * potentially expensive (at present, the only such thing detected is * or + 63 * at the start of the r.e., which can involve a lot of backup). Regmlen is 64 * supplied because the test in regexec() needs it and regcomp() is computing 65 * it anyway. 66 */ 67 68/* 69 * Structure for regexp "program". This is essentially a linear encoding 70 * of a nondeterministic finite-state machine (aka syntax charts or 71 * "railroad normal form" in parsing technology). Each node is an opcode 72 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 73 * all nodes except BRANCH implement concatenation; a "next" pointer with 74 * a BRANCH on both ends of it is connecting two alternatives. (Here we 75 * have one of the subtle syntax dependencies: an individual BRANCH (as 76 * opposed to a collection of them) is never concatenated with anything 77 * because of operator precedence.) The operand of some types of node is 78 * a literal string; for others, it is a node leading into a sub-FSM. In 79 * particular, the operand of a BRANCH node is the first node of the branch. 80 * (NB this is *not* a tree structure: the tail of the branch connects 81 * to the thing following the set of BRANCHes.) The opcodes are: 82 */ 83 84/* definition number opnd? meaning */ 85#define END 0 /* no End of program. */ 86#define BOL 1 /* no Match "" at beginning of line. */ 87#define EOL 2 /* no Match "" at end of line. */ 88#define ANY 3 /* no Match any one character. */ 89#define ANYOF 4 /* str Match any character in this string. */ 90#define ANYBUT 5 /* str Match any character not in this string. */ 91#define BRANCH 6 /* node Match this alternative, or the next... */ 92#define BACK 7 /* no Match "", "next" ptr points backward. */ 93#define EXACTLY 8 /* str Match this string. */ 94#define NOTHING 9 /* no Match empty string. */ 95#define STAR 10 /* node Match this (simple) thing 0 or more times. */ 96#define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 97#define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 98#define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 99#define OPEN 20 /* no Mark this point in input as start of #n. */ 100 /* OPEN+1 is number 1, etc. */ 101#define CLOSE 30 /* no Analogous to OPEN. */ 102 103/* 104 * Opcode notes: 105 * 106 * BRANCH The set of branches constituting a single choice are hooked 107 * together with their "next" pointers, since precedence prevents 108 * anything being concatenated to any individual branch. The 109 * "next" pointer of the last BRANCH in a choice points to the 110 * thing following the whole choice. This is also where the 111 * final "next" pointer of each individual branch points; each 112 * branch starts with the operand node of a BRANCH node. 113 * 114 * BACK Normal "next" pointers all implicitly point forward; BACK 115 * exists to make loop structures possible. 116 * 117 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 118 * BRANCH structures using BACK. Simple cases (one character 119 * per match) are implemented with STAR and PLUS for speed 120 * and to minimize recursive plunges. 121 * 122 * OPEN,CLOSE ...are numbered at compile time. 123 */ 124 125/* 126 * A node is one char of opcode followed by two chars of "next" pointer. 127 * "Next" pointers are stored as two 8-bit pieces, high order first. The 128 * value is a positive offset from the opcode of the node containing it. 129 * An operand, if any, simply follows the node. (Note that much of the 130 * code generation knows about this implicit relationship.) 131 * 132 * Using two bytes for the "next" pointer is vast overkill for most things, 133 * but allows patterns to get big without disasters. 134 */ 135#define OP(p) (*(p)) 136#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 137#define OPERAND(p) ((p) + 3) 138 139/* 140 * See regmagic.h for one further detail of program structure. 141 */ 142 143 144/* 145 * Utility definitions. 146 */ 147#ifndef CHARBITS 148#define UCHARAT(p) ((int)*(unsigned char *)(p)) 149#else 150#define UCHARAT(p) ((int)*(p)&CHARBITS) 151#endif 152 153#define FAIL(m) { regerror(m); return(NULL); } 154#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 155 156/* 157 * Flags to be passed up and down. 158 */ 159#define HASWIDTH 01 /* Known never to match null string. */ 160#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 161#define SPSTART 04 /* Starts with * or +. */ 162#define WORST 0 /* Worst case. */ 163 164/* 165 * Global work variables for regcomp(). 166 */ 167static char *regparse; /* Input-scan pointer. */ 168static int regnpar; /* () count. */ 169static char regdummy; 170static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 171static long regsize; /* Code size. */ 172 173/* 174 * Forward declarations for regcomp()'s friends. 175 */ 176#ifndef STATIC 177#define STATIC static 178#endif 179STATIC char *reg __P((int, int *)); 180STATIC char *regbranch __P((int *)); 181STATIC char *regpiece __P((int *)); 182STATIC char *regatom __P((int *)); 183STATIC char *regnode __P((int)); 184STATIC char *regnext __P((char *)); 185STATIC void regc __P((int)); 186STATIC void reginsert __P((int, char *)); 187STATIC void regtail __P((char *, char *)); 188STATIC void regoptail __P((char *, char *)); 189#ifdef STRCSPN 190STATIC int strcspn __P((char *, char *)); 191#endif 192 193/* 194 - regcomp - compile a regular expression into internal code 195 * 196 * We can't allocate space until we know how big the compiled form will be, 197 * but we can't compile it (and thus know how big it is) until we've got a 198 * place to put the code. So we cheat: we compile it twice, once with code 199 * generation turned off and size counting turned on, and once "for real". 200 * This also means that we don't allocate space until we are sure that the 201 * thing really will compile successfully, and we never have to move the 202 * code and thus invalidate pointers into it. (Note that it has to be in 203 * one piece because free() must be able to free it all.) 204 * 205 * Beware that the optimization-preparation code in here knows about some 206 * of the structure of the compiled regexp. 207 */ 208regexp * 209__compat_regcomp(expn) 210const char *expn; 211{ 212 regexp *r; 213 char *scan; 214 char *longest; 215 int len; 216 int flags; 217 218 if (expn == NULL) 219 FAIL("NULL argument"); 220 221 /* First pass: determine size, legality. */ 222#ifdef notdef 223 if (expn[0] == '.' && expn[1] == '*') expn += 2; /* aid grep */ 224#endif 225 /* LINTED const castaway */ 226 regparse = (char *)expn; 227 regnpar = 1; 228 regsize = 0L; 229 regcode = ®dummy; 230 regc(MAGIC); 231 if (reg(0, &flags) == NULL) 232 return(NULL); 233 234 /* Small enough for pointer-storage convention? */ 235 if (regsize >= 32767L) /* Probably could be 65535L. */ 236 FAIL("regexp too big"); 237 238 /* Allocate space. */ 239 r = malloc(sizeof(regexp) + (unsigned)regsize); 240 if (r == NULL) 241 FAIL("out of space"); 242 243 /* Second pass: emit code. */ 244 /* LINTED const castaway */ 245 regparse = (char *)expn; 246 regnpar = 1; 247 regcode = r->program; 248 regc(MAGIC); 249 if (reg(0, &flags) == NULL) { 250 free(r); 251 return(NULL); 252 } 253 254 /* Dig out information for optimizations. */ 255 r->regstart = '\0'; /* Worst-case defaults. */ 256 r->reganch = 0; 257 r->regmust = NULL; 258 r->regmlen = 0; 259 scan = r->program+1; /* First BRANCH. */ 260 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 261 scan = OPERAND(scan); 262 263 /* Starting-point info. */ 264 if (OP(scan) == EXACTLY) 265 r->regstart = *OPERAND(scan); 266 else if (OP(scan) == BOL) 267 r->reganch++; 268 269 /* 270 * If there's something expensive in the r.e., find the 271 * longest literal string that must appear and make it the 272 * regmust. Resolve ties in favor of later strings, since 273 * the regstart check works with the beginning of the r.e. 274 * and avoiding duplication strengthens checking. Not a 275 * strong reason, but sufficient in the absence of others. 276 */ 277 if (flags&SPSTART) { 278 longest = NULL; 279 len = 0; 280 for (; scan != NULL; scan = regnext(scan)) 281 if (OP(scan) == EXACTLY && (int) strlen(OPERAND(scan)) >= len) { 282 longest = OPERAND(scan); 283 len = strlen(OPERAND(scan)); 284 } 285 r->regmust = longest; 286 r->regmlen = len; 287 } 288 } 289 290 return(r); 291} 292 293/* 294 - reg - regular expression, i.e. main body or parenthesized thing 295 * 296 * Caller must absorb opening parenthesis. 297 * 298 * Combining parenthesis handling with the base level of regular expression 299 * is a trifle forced, but the need to tie the tails of the branches to what 300 * follows makes it hard to avoid. 301 */ 302static char * 303reg(paren, flagp) 304int paren; /* Parenthesized? */ 305int *flagp; 306{ 307 char *ret; 308 char *br; 309 char *ender; 310 int parno = 0; 311 int flags; 312 313 *flagp = HASWIDTH; /* Tentatively. */ 314 315 /* Make an OPEN node, if parenthesized. */ 316 if (paren) { 317 if (regnpar >= NSUBEXP) 318 FAIL("too many ()"); 319 parno = regnpar; 320 regnpar++; 321 ret = regnode(OPEN+parno); 322 } else 323 ret = NULL; 324 325 /* Pick up the branches, linking them together. */ 326 br = regbranch(&flags); 327 if (br == NULL) 328 return(NULL); 329 if (ret != NULL) 330 regtail(ret, br); /* OPEN -> first. */ 331 else 332 ret = br; 333 if (!(flags&HASWIDTH)) 334 *flagp &= ~HASWIDTH; 335 *flagp |= flags&SPSTART; 336 while (*regparse == '|' || *regparse == '\n') { 337 regparse++; 338 br = regbranch(&flags); 339 if (br == NULL) 340 return(NULL); 341 regtail(ret, br); /* BRANCH -> BRANCH. */ 342 if (!(flags&HASWIDTH)) 343 *flagp &= ~HASWIDTH; 344 *flagp |= flags&SPSTART; 345 } 346 347 /* Make a closing node, and hook it on the end. */ 348 ender = regnode((paren) ? CLOSE+parno : END); 349 regtail(ret, ender); 350 351 /* Hook the tails of the branches to the closing node. */ 352 for (br = ret; br != NULL; br = regnext(br)) 353 regoptail(br, ender); 354 355 /* Check for proper termination. */ 356 if (paren && *regparse++ != ')') { 357 FAIL("unmatched ()"); 358 } else if (!paren && *regparse != '\0') { 359 if (*regparse == ')') { 360 FAIL("unmatched ()"); 361 } else 362 FAIL("junk on end"); /* "Can't happen". */ 363 /* NOTREACHED */ 364 } 365 366 return(ret); 367} 368 369/* 370 - regbranch - one alternative of an | operator 371 * 372 * Implements the concatenation operator. 373 */ 374static char * 375regbranch(flagp) 376int *flagp; 377{ 378 char *ret; 379 char *chain; 380 char *latest; 381 int flags; 382 383 *flagp = WORST; /* Tentatively. */ 384 385 ret = regnode(BRANCH); 386 chain = NULL; 387 while (*regparse != '\0' && *regparse != ')' && 388 *regparse != '\n' && *regparse != '|') { 389 latest = regpiece(&flags); 390 if (latest == NULL) 391 return(NULL); 392 *flagp |= flags&HASWIDTH; 393 if (chain == NULL) /* First piece. */ 394 *flagp |= flags&SPSTART; 395 else 396 regtail(chain, latest); 397 chain = latest; 398 } 399 if (chain == NULL) /* Loop ran zero times. */ 400 (void) regnode(NOTHING); 401 402 return(ret); 403} 404 405/* 406 - regpiece - something followed by possible [*+?] 407 * 408 * Note that the branching code sequences used for ? and the general cases 409 * of * and + are somewhat optimized: they use the same NOTHING node as 410 * both the endmarker for their branch list and the body of the last branch. 411 * It might seem that this node could be dispensed with entirely, but the 412 * endmarker role is not redundant. 413 */ 414static char * 415regpiece(flagp) 416int *flagp; 417{ 418 char *ret; 419 char op; 420 char *next; 421 int flags; 422 423 ret = regatom(&flags); 424 if (ret == NULL) 425 return(NULL); 426 427 op = *regparse; 428 if (!ISMULT(op)) { 429 *flagp = flags; 430 return(ret); 431 } 432 433 if (!(flags&HASWIDTH) && op != '?') 434 FAIL("*+ operand could be empty"); 435 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 436 437 if (op == '*' && (flags&SIMPLE)) 438 reginsert(STAR, ret); 439 else if (op == '*') { 440 /* Emit x* as (x&|), where & means "self". */ 441 reginsert(BRANCH, ret); /* Either x */ 442 regoptail(ret, regnode(BACK)); /* and loop */ 443 regoptail(ret, ret); /* back */ 444 regtail(ret, regnode(BRANCH)); /* or */ 445 regtail(ret, regnode(NOTHING)); /* null. */ 446 } else if (op == '+' && (flags&SIMPLE)) 447 reginsert(PLUS, ret); 448 else if (op == '+') { 449 /* Emit x+ as x(&|), where & means "self". */ 450 next = regnode(BRANCH); /* Either */ 451 regtail(ret, next); 452 regtail(regnode(BACK), ret); /* loop back */ 453 regtail(next, regnode(BRANCH)); /* or */ 454 regtail(ret, regnode(NOTHING)); /* null. */ 455 } else if (op == '?') { 456 /* Emit x? as (x|) */ 457 reginsert(BRANCH, ret); /* Either x */ 458 regtail(ret, regnode(BRANCH)); /* or */ 459 next = regnode(NOTHING); /* null. */ 460 regtail(ret, next); 461 regoptail(ret, next); 462 } 463 regparse++; 464 if (ISMULT(*regparse)) 465 FAIL("nested *?+"); 466 467 return(ret); 468} 469 470/* 471 - regatom - the lowest level 472 * 473 * Optimization: gobbles an entire sequence of ordinary characters so that 474 * it can turn them into a single node, which is smaller to store and 475 * faster to run. Backslashed characters are exceptions, each becoming a 476 * separate node; the code is simpler that way and it's not worth fixing. 477 */ 478static char * 479regatom(flagp) 480int *flagp; 481{ 482 char *ret; 483 int flags; 484 485 *flagp = WORST; /* Tentatively. */ 486 487 switch (*regparse++) { 488 /* FIXME: these chars only have meaning at beg/end of pat? */ 489 case '^': 490 ret = regnode(BOL); 491 break; 492 case '$': 493 ret = regnode(EOL); 494 break; 495 case '.': 496 ret = regnode(ANY); 497 *flagp |= HASWIDTH|SIMPLE; 498 break; 499 case '[': { 500 int class; 501 int classend; 502 503 if (*regparse == '^') { /* Complement of range. */ 504 ret = regnode(ANYBUT); 505 regparse++; 506 } else 507 ret = regnode(ANYOF); 508 if (*regparse == ']' || *regparse == '-') 509 regc(*regparse++); 510 while (*regparse != '\0' && *regparse != ']') { 511 if (*regparse == '-') { 512 regparse++; 513 if (*regparse == ']' || *regparse == '\0') 514 regc('-'); 515 else { 516 class = UCHARAT(regparse-2)+1; 517 classend = UCHARAT(regparse); 518 if (class > classend+1) 519 FAIL("invalid [] range"); 520 for (; class <= classend; class++) 521 regc(class); 522 regparse++; 523 } 524 } else 525 regc(*regparse++); 526 } 527 regc('\0'); 528 if (*regparse != ']') 529 FAIL("unmatched []"); 530 regparse++; 531 *flagp |= HASWIDTH|SIMPLE; 532 } 533 break; 534 case '(': 535 ret = reg(1, &flags); 536 if (ret == NULL) 537 return(NULL); 538 *flagp |= flags&(HASWIDTH|SPSTART); 539 break; 540 case '\0': 541 case '|': 542 case '\n': 543 case ')': 544 FAIL("internal urp"); /* Supposed to be caught earlier. */ 545 case '?': 546 case '+': 547 case '*': 548 FAIL("?+* follows nothing"); 549 case '\\': 550 switch (*regparse++) { 551 case '\0': 552 FAIL("trailing \\"); 553 case '<': 554 ret = regnode(WORDA); 555 break; 556 case '>': 557 ret = regnode(WORDZ); 558 break; 559 /* FIXME: Someday handle \1, \2, ... */ 560 default: 561 /* Handle general quoted chars in exact-match routine */ 562 goto de_fault; 563 } 564 break; 565 de_fault: 566 /*FALLTHROUGH*/ 567 default: 568 /* 569 * Encode a string of characters to be matched exactly. 570 * 571 * This is a bit tricky due to quoted chars and due to 572 * '*', '+', and '?' taking the SINGLE char previous 573 * as their operand. 574 * 575 * On entry, the char at regparse[-1] is going to go 576 * into the string, no matter what it is. (It could be 577 * following a \ if we are entered from the '\' case.) 578 * 579 * Basic idea is to pick up a good char in ch and 580 * examine the next char. If it's *+? then we twiddle. 581 * If it's \ then we frozzle. If it's other magic char 582 * we push ch and terminate the string. If none of the 583 * above, we push ch on the string and go around again. 584 * 585 * regprev is used to remember where "the current char" 586 * starts in the string, if due to a *+? we need to back 587 * up and put the current char in a separate, 1-char, string. 588 * When regprev is NULL, ch is the only char in the 589 * string; this is used in *+? handling, and in setting 590 * flags |= SIMPLE at the end. 591 */ 592 { 593 char *regprev; 594 char ch; 595 596 regparse--; /* Look at cur char */ 597 ret = regnode(EXACTLY); 598 for ( regprev = 0 ; ; ) { 599 ch = *regparse++; /* Get current char */ 600 switch (*regparse) { /* look at next one */ 601 602 default: 603 regc(ch); /* Add cur to string */ 604 break; 605 606 case '.': case '[': case '(': 607 case ')': case '|': case '\n': 608 case '$': case '^': 609 case '\0': 610 /* FIXME, $ and ^ should not always be magic */ 611 magic: 612 regc(ch); /* dump cur char */ 613 goto done; /* and we are done */ 614 615 case '?': case '+': case '*': 616 if (!regprev) /* If just ch in str, */ 617 goto magic; /* use it */ 618 /* End mult-char string one early */ 619 regparse = regprev; /* Back up parse */ 620 goto done; 621 622 case '\\': 623 regc(ch); /* Cur char OK */ 624 switch (regparse[1]){ /* Look after \ */ 625 case '\0': 626 case '<': 627 case '>': 628 /* FIXME: Someday handle \1, \2, ... */ 629 goto done; /* Not quoted */ 630 default: 631 /* Backup point is \, scan * point is after it. */ 632 regprev = regparse; 633 regparse++; 634 continue; /* NOT break; */ 635 } 636 } 637 regprev = regparse; /* Set backup point */ 638 } 639 done: 640 regc('\0'); 641 *flagp |= HASWIDTH; 642 if (!regprev) /* One char? */ 643 *flagp |= SIMPLE; 644 } 645 break; 646 } 647 648 return(ret); 649} 650 651/* 652 - regnode - emit a node 653 */ 654static char * /* Location. */ 655regnode(op) 656int op; 657{ 658 char *ret; 659 char *ptr; 660 661 ret = regcode; 662 if (ret == ®dummy) { 663 regsize += 3; 664 return(ret); 665 } 666 667 ptr = ret; 668 *ptr++ = op; 669 *ptr++ = '\0'; /* Null "next" pointer. */ 670 *ptr++ = '\0'; 671 regcode = ptr; 672 673 return(ret); 674} 675 676/* 677 - regc - emit (if appropriate) a byte of code 678 */ 679static void 680regc(b) 681int b; 682{ 683 if (regcode != ®dummy) 684 *regcode++ = b; 685 else 686 regsize++; 687} 688 689/* 690 - reginsert - insert an operator in front of already-emitted operand 691 * 692 * Means relocating the operand. 693 */ 694static void 695reginsert(op, opnd) 696int op; 697char *opnd; 698{ 699 char *src; 700 char *dst; 701 char *place; 702 703 if (regcode == ®dummy) { 704 regsize += 3; 705 return; 706 } 707 708 src = regcode; 709 regcode += 3; 710 dst = regcode; 711 while (src > opnd) 712 *--dst = *--src; 713 714 place = opnd; /* Op node, where operand used to be. */ 715 *place++ = op; 716 *place++ = '\0'; 717 *place++ = '\0'; 718} 719 720/* 721 - regtail - set the next-pointer at the end of a node chain 722 */ 723static void 724regtail(p, val) 725char *p; 726char *val; 727{ 728 char *scan; 729 char *temp; 730 int offset; 731 732 if (p == ®dummy) 733 return; 734 735 /* Find last node. */ 736 scan = p; 737 for (;;) { 738 temp = regnext(scan); 739 if (temp == NULL) 740 break; 741 scan = temp; 742 } 743 744 if (OP(scan) == BACK) 745 offset = scan - val; 746 else 747 offset = val - scan; 748 *(scan+1) = ((unsigned int)offset>>8)&0377; 749 *(scan+2) = offset&0377; 750} 751 752/* 753 - regoptail - regtail on operand of first argument; nop if operandless 754 */ 755static void 756regoptail(p, val) 757char *p; 758char *val; 759{ 760 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 761 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 762 return; 763 regtail(OPERAND(p), val); 764} 765 766/* 767 * regexec and friends 768 */ 769 770/* 771 * Global work variables for regexec(). 772 */ 773static char *reginput; /* String-input pointer. */ 774static char *regbol; /* Beginning of input, for ^ check. */ 775static char **regstartp; /* Pointer to startp array. */ 776static char **regendp; /* Ditto for endp. */ 777 778/* 779 * Forwards. 780 */ 781STATIC int regtry __P((const regexp *, const char *)); 782STATIC int regmatch __P((char *)); 783STATIC int regrepeat __P((char *)); 784 785#ifdef DEBUG 786int regnarrate = 0; 787void regdump __P((regexp *)); 788STATIC char *regprop __P((char *)); 789#endif 790 791/* 792 - regexec - match a regexp against a string 793 */ 794int 795__compat_regexec(prog, string) 796const regexp *prog; 797const char *string; 798{ 799 char *s; 800 801 /* Be paranoid... */ 802 if (prog == NULL || string == NULL) { 803 regerror("NULL parameter"); 804 return(0); 805 } 806 807 /* Check validity of program. */ 808 if (UCHARAT(prog->program) != MAGIC) { 809 regerror("corrupted program"); 810 return(0); 811 } 812 813 /* If there is a "must appear" string, look for it. */ 814 if (prog->regmust != NULL) { 815 /* LINTED const castaway */ 816 s = (char *)string; 817 while ((s = strchr(s, prog->regmust[0])) != NULL) { 818 if (strncmp(s, prog->regmust, 819 (size_t)prog->regmlen) == 0) 820 break; /* Found it. */ 821 s++; 822 } 823 if (s == NULL) /* Not present. */ 824 return(0); 825 } 826 827 /* Mark beginning of line for ^ . */ 828 /* LINTED const castaway */ 829 regbol = (char *)string; 830 831 /* Simplest case: anchored match need be tried only once. */ 832 if (prog->reganch) 833 return(regtry(prog, string)); 834 835 /* Messy cases: unanchored match. */ 836 /* LINTED const castaway */ 837 s = (char *)string; 838 if (prog->regstart != '\0') 839 /* We know what char it must start with. */ 840 while ((s = strchr(s, prog->regstart)) != NULL) { 841 if (regtry(prog, s)) 842 return(1); 843 s++; 844 } 845 else 846 /* We don't -- general case. */ 847 do { 848 if (regtry(prog, s)) 849 return(1); 850 } while (*s++ != '\0'); 851 852 /* Failure. */ 853 return(0); 854} 855 856/* 857 - regtry - try match at specific point 858 */ 859static int /* 0 failure, 1 success */ 860regtry(prog, string) 861const regexp *prog; 862const char *string; 863{ 864 int i; 865 char **sp; 866 char **ep; 867 868 /* LINTED const castaway */ 869 reginput = (char *)string; /* XXX */ 870 regstartp = (char **)prog->startp; /* XXX */ 871 regendp = (char **)prog->endp; /* XXX */ 872 873 sp = (char **)prog->startp; /* XXX */ 874 ep = (char **)prog->endp; /* XXX */ 875 for (i = NSUBEXP; i > 0; i--) { 876 *sp++ = NULL; 877 *ep++ = NULL; 878 } 879 if (regmatch((char *)prog->program + 1)) { /* XXX */ 880 /* LINTED const castaway */ 881 ((regexp *)prog)->startp[0] = (char *)string; /* XXX */ 882 /* LINTED const castaway */ 883 ((regexp *)prog)->endp[0] = reginput; /* XXX */ 884 return(1); 885 } else 886 return(0); 887} 888 889/* 890 - regmatch - main matching routine 891 * 892 * Conceptually the strategy is simple: check to see whether the current 893 * node matches, call self recursively to see whether the rest matches, 894 * and then act accordingly. In practice we make some effort to avoid 895 * recursion, in particular by going through "ordinary" nodes (that don't 896 * need to know whether the rest of the match failed) by a loop instead of 897 * by recursion. 898 */ 899static int /* 0 failure, 1 success */ 900regmatch(prog) 901char *prog; 902{ 903 char *scan; /* Current node. */ 904 char *next; /* Next node. */ 905 906 scan = prog; 907#ifdef DEBUG 908 if (scan != NULL && regnarrate) 909 fprintf(stderr, "%s(\n", regprop(scan)); 910#endif 911 while (scan != NULL) { 912#ifdef DEBUG 913 if (regnarrate) 914 fprintf(stderr, "%s...\n", regprop(scan)); 915#endif 916 next = regnext(scan); 917 918 switch (OP(scan)) { 919 case BOL: 920 if (reginput != regbol) 921 return(0); 922 break; 923 case EOL: 924 if (*reginput != '\0') 925 return(0); 926 break; 927 case WORDA: 928 /* Must be looking at a letter, digit, or _ */ 929 if ((!isalnum(UCHARAT(reginput))) && *reginput != '_') 930 return(0); 931 /* Prev must be BOL or nonword */ 932 if (reginput > regbol && 933 (isalnum(UCHARAT(reginput - 1)) 934 || reginput[-1] == '_')) 935 return(0); 936 break; 937 case WORDZ: 938 /* Must be looking at non letter, digit, or _ */ 939 if (isalnum(UCHARAT(reginput)) || *reginput == '_') 940 return(0); 941 /* We don't care what the previous char was */ 942 break; 943 case ANY: 944 if (*reginput == '\0') 945 return(0); 946 reginput++; 947 break; 948 case EXACTLY: { 949 int len; 950 char *opnd; 951 952 opnd = OPERAND(scan); 953 /* Inline the first character, for speed. */ 954 if (*opnd != *reginput) 955 return(0); 956 len = strlen(opnd); 957 if (len > 1 && strncmp(opnd, reginput, 958 (size_t)len) != 0) 959 return(0); 960 reginput += len; 961 } 962 break; 963 case ANYOF: 964 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 965 return(0); 966 reginput++; 967 break; 968 case ANYBUT: 969 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 970 return(0); 971 reginput++; 972 break; 973 case NOTHING: 974 break; 975 case BACK: 976 break; 977 case OPEN+1: 978 case OPEN+2: 979 case OPEN+3: 980 case OPEN+4: 981 case OPEN+5: 982 case OPEN+6: 983 case OPEN+7: 984 case OPEN+8: 985 case OPEN+9: { 986 int no; 987 char *save; 988 989 no = OP(scan) - OPEN; 990 save = reginput; 991 992 if (regmatch(next)) { 993 /* 994 * Don't set startp if some later 995 * invocation of the same parentheses 996 * already has. 997 */ 998 if (regstartp[no] == NULL) 999 regstartp[no] = save; 1000 return(1); 1001 } else 1002 return(0); 1003 } 1004 case CLOSE+1: 1005 case CLOSE+2: 1006 case CLOSE+3: 1007 case CLOSE+4: 1008 case CLOSE+5: 1009 case CLOSE+6: 1010 case CLOSE+7: 1011 case CLOSE+8: 1012 case CLOSE+9: { 1013 int no; 1014 char *save; 1015 1016 no = OP(scan) - CLOSE; 1017 save = reginput; 1018 1019 if (regmatch(next)) { 1020 /* 1021 * Don't set endp if some later 1022 * invocation of the same parentheses 1023 * already has. 1024 */ 1025 if (regendp[no] == NULL) 1026 regendp[no] = save; 1027 return(1); 1028 } else 1029 return(0); 1030 } 1031 case BRANCH: { 1032 char *save; 1033 1034 if (OP(next) != BRANCH) /* No choice. */ 1035 next = OPERAND(scan); /* Avoid recursion. */ 1036 else { 1037 do { 1038 save = reginput; 1039 if (regmatch(OPERAND(scan))) 1040 return(1); 1041 reginput = save; 1042 scan = regnext(scan); 1043 } while (scan != NULL && OP(scan) == BRANCH); 1044 return(0); 1045 /* NOTREACHED */ 1046 } 1047 } 1048 break; 1049 case STAR: 1050 case PLUS: { 1051 char nextch; 1052 int no; 1053 char *save; 1054 int min; 1055 1056 /* 1057 * Lookahead to avoid useless match attempts 1058 * when we know what character comes next. 1059 */ 1060 nextch = '\0'; 1061 if (OP(next) == EXACTLY) 1062 nextch = *OPERAND(next); 1063 min = (OP(scan) == STAR) ? 0 : 1; 1064 save = reginput; 1065 no = regrepeat(OPERAND(scan)); 1066 while (no >= min) { 1067 /* If it could work, try it. */ 1068 if (nextch == '\0' || *reginput == nextch) 1069 if (regmatch(next)) 1070 return(1); 1071 /* Couldn't or didn't -- back up. */ 1072 no--; 1073 reginput = save + no; 1074 } 1075 return(0); 1076 } 1077 case END: 1078 return(1); /* Success! */ 1079 default: 1080 regerror("memory corruption"); 1081 return(0); 1082 } 1083 1084 scan = next; 1085 } 1086 1087 /* 1088 * We get here only if there's trouble -- normally "case END" is 1089 * the terminating point. 1090 */ 1091 regerror("corrupted pointers"); 1092 return(0); 1093} 1094 1095/* 1096 - regrepeat - repeatedly match something simple, report how many 1097 */ 1098static int 1099regrepeat(p) 1100char *p; 1101{ 1102 int count = 0; 1103 char *scan; 1104 char *opnd; 1105 1106 scan = reginput; 1107 opnd = OPERAND(p); 1108 switch (OP(p)) { 1109 case ANY: 1110 count = strlen(scan); 1111 scan += count; 1112 break; 1113 case EXACTLY: 1114 while (*opnd == *scan) { 1115 count++; 1116 scan++; 1117 } 1118 break; 1119 case ANYOF: 1120 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1121 count++; 1122 scan++; 1123 } 1124 break; 1125 case ANYBUT: 1126 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1127 count++; 1128 scan++; 1129 } 1130 break; 1131 default: /* Oh dear. Called inappropriately. */ 1132 regerror("internal foulup"); 1133 count = 0; /* Best compromise. */ 1134 break; 1135 } 1136 reginput = scan; 1137 1138 return(count); 1139} 1140 1141/* 1142 - regnext - dig the "next" pointer out of a node 1143 */ 1144static char * 1145regnext(p) 1146char *p; 1147{ 1148 int offset; 1149 1150 if (p == ®dummy) 1151 return(NULL); 1152 1153 offset = NEXT(p); 1154 if (offset == 0) 1155 return(NULL); 1156 1157 if (OP(p) == BACK) 1158 return(p-offset); 1159 else 1160 return(p+offset); 1161} 1162 1163#ifdef DEBUG 1164 1165/* 1166 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1167 */ 1168void 1169regdump(r) 1170regexp *r; 1171{ 1172 char *s; 1173 char op = EXACTLY; /* Arbitrary non-END op. */ 1174 char *next; 1175 1176 1177 s = r->program + 1; 1178 while (op != END) { /* While that wasn't END last time... */ 1179 op = OP(s); 1180 printf("%2td%s", s-r->program, regprop(s)); /* Where, what. */ 1181 next = regnext(s); 1182 if (next == NULL) /* Next ptr. */ 1183 printf("(0)"); 1184 else 1185 printf("(%td)", (s-r->program)+(next-s)); 1186 s += 3; 1187 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1188 /* Literal string, where present. */ 1189 while (*s != '\0') { 1190 putchar(*s); 1191 s++; 1192 } 1193 s++; 1194 } 1195 putchar('\n'); 1196 } 1197 1198 /* Header fields of interest. */ 1199 if (r->regstart != '\0') 1200 printf("start `%c' ", r->regstart); 1201 if (r->reganch) 1202 printf("anchored "); 1203 if (r->regmust != NULL) 1204 printf("must have \"%s\"", r->regmust); 1205 printf("\n"); 1206} 1207 1208/* 1209 - regprop - printable representation of opcode 1210 */ 1211static char * 1212regprop(op) 1213char *op; 1214{ 1215 char *p; 1216 static char buf[50]; 1217 1218 (void)strncpy(buf, ":", sizeof(buf) - 1); 1219 1220 switch (OP(op)) { 1221 case BOL: 1222 p = "BOL"; 1223 break; 1224 case EOL: 1225 p = "EOL"; 1226 break; 1227 case ANY: 1228 p = "ANY"; 1229 break; 1230 case ANYOF: 1231 p = "ANYOF"; 1232 break; 1233 case ANYBUT: 1234 p = "ANYBUT"; 1235 break; 1236 case BRANCH: 1237 p = "BRANCH"; 1238 break; 1239 case EXACTLY: 1240 p = "EXACTLY"; 1241 break; 1242 case NOTHING: 1243 p = "NOTHING"; 1244 break; 1245 case BACK: 1246 p = "BACK"; 1247 break; 1248 case END: 1249 p = "END"; 1250 break; 1251 case OPEN+1: 1252 case OPEN+2: 1253 case OPEN+3: 1254 case OPEN+4: 1255 case OPEN+5: 1256 case OPEN+6: 1257 case OPEN+7: 1258 case OPEN+8: 1259 case OPEN+9: 1260 (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf), 1261 "OPEN%d", OP(op)-OPEN); 1262 p = NULL; 1263 break; 1264 case CLOSE+1: 1265 case CLOSE+2: 1266 case CLOSE+3: 1267 case CLOSE+4: 1268 case CLOSE+5: 1269 case CLOSE+6: 1270 case CLOSE+7: 1271 case CLOSE+8: 1272 case CLOSE+9: 1273 (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf), 1274 "CLOSE%d", OP(op)-CLOSE); 1275 p = NULL; 1276 break; 1277 case STAR: 1278 p = "STAR"; 1279 break; 1280 case PLUS: 1281 p = "PLUS"; 1282 break; 1283 case WORDA: 1284 p = "WORDA"; 1285 break; 1286 case WORDZ: 1287 p = "WORDZ"; 1288 break; 1289 default: 1290 p = NULL; 1291 regerror("corrupted opcode"); 1292 break; 1293 } 1294 if (p != NULL) 1295 (void)strncat(buf, p, sizeof(buf) - strlen(buf) - 1); 1296 return(buf); 1297} 1298#endif 1299 1300/* 1301 * The following is provided for those people who do not have strcspn() in 1302 * their C libraries. They should get off their butts and do something 1303 * about it; at least one public-domain implementation of those (highly 1304 * useful) string routines has been published on Usenet. 1305 */ 1306#ifdef STRCSPN 1307/* 1308 * strcspn - find length of initial segment of s1 consisting entirely 1309 * of characters not from s2 1310 */ 1311 1312static int 1313strcspn(s1, s2) 1314char *s1; 1315char *s2; 1316{ 1317 char *scan1; 1318 char *scan2; 1319 int count; 1320 1321 count = 0; 1322 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1323 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1324 if (*scan1 == *scan2++) 1325 return(count); 1326 count++; 1327 } 1328 return(count); 1329} 1330#endif 1331