1/**************************************************************** 2Copyright (C) Lucent Technologies 1997 3All Rights Reserved 4 5Permission to use, copy, modify, and distribute this software and 6its documentation for any purpose and without fee is hereby 7granted, provided that the above copyright notice appear in all 8copies and that both that the copyright notice and this 9permission notice and warranty disclaimer appear in supporting 10documentation, and that the name Lucent Technologies or any of 11its entities not be used in advertising or publicity pertaining 12to distribution of the software without specific, written prior 13permission. 14 15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22THIS SOFTWARE. 23****************************************************************/ 24 25/* lasciate ogne speranza, voi ch'intrate. */ 26 27#define DEBUG 28 29#include <ctype.h> 30#include <stdio.h> 31#include <string.h> 32#include <stdlib.h> 33#include "awk.h" 34#include "ytab.h" 35#include <fcntl.h> 36 37#define HAT (NCHARS+2) /* matches ^ in regular expr */ 38 /* NCHARS is 2**n */ 39#define MAXLIN 22 40 41#define type(v) (v)->nobj /* badly overloaded here */ 42#define info(v) (v)->ntype /* badly overloaded here */ 43#define left(v) (v)->narg[0] 44#define right(v) (v)->narg[1] 45#define parent(v) (v)->nnext 46 47#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 48#define ELEAF case EMPTYRE: /* empty string in regexp */ 49#define UNARY case STAR: case PLUS: case QUEST: 50 51/* encoding in tree Nodes: 52 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 53 left is index, right contains value or pointer to value 54 unary (STAR, PLUS, QUEST): left is child, right is null 55 binary (CAT, OR): left and right are children 56 parent contains pointer to parent 57*/ 58 59 60int *setvec; 61int *tmpset; 62int maxsetvec = 0; 63 64int rtok; /* next token in current re */ 65int rlxval; 66static uschar *rlxstr; 67static uschar *prestr; /* current position in current re */ 68static uschar *lastre; /* origin of last re */ 69static uschar *lastatom; /* origin of last Atom */ 70static uschar *starttok; 71static char *basestr; /* starts with original, replaced during 72 repetition processing */ 73static char *firstbasestr; 74 75static FILE * replogfile = 0; 76 77static int setcnt; 78static int poscnt; 79 80char *patbeg; 81int patlen; 82 83#define NFA 20 /* cache this many dynamic fa's */ 84fa *fatab[NFA]; 85int nfatab = 0; /* entries in fatab */ 86 87fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ 88{ 89 int i, use, nuse; 90 fa *pfa; 91 static int now = 1; 92 93 if (setvec == 0) { /* first time through any RE */ 94 maxsetvec = MAXLIN; 95 setvec = (int *) malloc(maxsetvec * sizeof(int)); 96 tmpset = (int *) malloc(maxsetvec * sizeof(int)); 97 if (setvec == 0 || tmpset == 0) 98 overflo("out of space initializing makedfa"); 99 } 100 101 if (compile_time) /* a constant for sure */ 102 return mkdfa(s, anchor); 103 for (i = 0; i < nfatab; i++) /* is it there already? */ 104 if (fatab[i]->anchor == anchor 105 && strcmp((const char *) fatab[i]->restr, s) == 0) { 106 fatab[i]->use = now++; 107 return fatab[i]; 108 } 109 pfa = mkdfa(s, anchor); 110 if (nfatab < NFA) { /* room for another */ 111 fatab[nfatab] = pfa; 112 fatab[nfatab]->use = now++; 113 nfatab++; 114 return pfa; 115 } 116 use = fatab[0]->use; /* replace least-recently used */ 117 nuse = 0; 118 for (i = 1; i < nfatab; i++) 119 if (fatab[i]->use < use) { 120 use = fatab[i]->use; 121 nuse = i; 122 } 123 freefa(fatab[nuse]); 124 fatab[nuse] = pfa; 125 pfa->use = now++; 126 return pfa; 127} 128 129fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ 130 /* anchor = 1 for anchored matches, else 0 */ 131{ 132 Node *p, *p1; 133 fa *f; 134 135 firstbasestr = (char *)s; 136 basestr = firstbasestr; 137 if (replogfile==0) { 138 /* disabled 139 replogfile = fopen("/tmp/repeatlog", "a"); 140 */ 141 } 142 p = reparse(s); 143 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 144 /* put ALL STAR in front of reg. exp. */ 145 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 146 /* put FINAL after reg. exp. */ 147 148 poscnt = 0; 149 penter(p1); /* enter parent pointers and leaf indices */ 150 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) 151 overflo("out of space for fa"); 152 f->accept = poscnt-1; /* penter has computed number of positions in re */ 153 cfoll(f, p1); /* set up follow sets */ 154 freetr(p1); 155 if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL) 156 overflo("out of space in makedfa"); 157 if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) 158 overflo("out of space in makedfa"); 159 *f->posns[1] = 0; 160 f->initstat = makeinit(f, anchor); 161 f->anchor = anchor; 162 f->restr = (uschar *) tostring(s); 163 if (replogfile) { 164 fflush(replogfile); 165 fclose(replogfile); 166 replogfile=0; 167 } 168 if (firstbasestr != basestr) { 169 if (basestr) free(basestr); 170 } 171 return f; 172} 173 174int makeinit(fa *f, int anchor) 175{ 176 int i, k; 177 178 f->curstat = 2; 179 f->out[2] = 0; 180 f->reset = 0; 181 k = *(f->re[0].lfollow); 182 xfree(f->posns[2]); 183 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 184 overflo("out of space in makeinit"); 185 for (i=0; i <= k; i++) { 186 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 187 } 188 if ((f->posns[2])[1] == f->accept) 189 f->out[2] = 1; 190 for (i=0; i < NCHARS; i++) 191 f->gototab[2][i] = 0; 192 f->curstat = cgoto(f, 2, HAT); 193 if (anchor) { 194 *f->posns[2] = k-1; /* leave out position 0 */ 195 for (i=0; i < k; i++) { 196 (f->posns[0])[i] = (f->posns[2])[i]; 197 } 198 199 f->out[0] = f->out[2]; 200 if (f->curstat != 2) 201 --(*f->posns[f->curstat]); 202 } 203 return f->curstat; 204} 205 206void penter(Node *p) /* set up parent pointers and leaf indices */ 207{ 208 switch (type(p)) { 209 ELEAF 210 LEAF 211 info(p) = poscnt; 212 poscnt++; 213 break; 214 UNARY 215 penter(left(p)); 216 parent(left(p)) = p; 217 break; 218 case CAT: 219 case OR: 220 penter(left(p)); 221 penter(right(p)); 222 parent(left(p)) = p; 223 parent(right(p)) = p; 224 break; 225 default: /* can't happen */ 226 FATAL("can't happen: unknown type %d in penter", type(p)); 227 break; 228 } 229} 230 231void freetr(Node *p) /* free parse tree */ 232{ 233 switch (type(p)) { 234 ELEAF 235 LEAF 236 xfree(p); 237 break; 238 UNARY 239 freetr(left(p)); 240 xfree(p); 241 break; 242 case CAT: 243 case OR: 244 freetr(left(p)); 245 freetr(right(p)); 246 xfree(p); 247 break; 248 default: /* can't happen */ 249 FATAL("can't happen: unknown type %d in freetr", type(p)); 250 break; 251 } 252} 253 254/* in the parsing of regular expressions, metacharacters like . have */ 255/* to be seen literally; \056 is not a metacharacter. */ 256 257int hexstr(char **pp) /* find and eval hex string at pp, return new p */ 258{ /* only pick up one 8-bit byte (2 chars) */ 259 uschar *p; 260 int n = 0; 261 int i; 262 263 for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) { 264 if (isdigit(*p)) 265 n = 16 * n + *p - '0'; 266 else if (*p >= 'a' && *p <= 'f') 267 n = 16 * n + *p - 'a' + 10; 268 else if (*p >= 'A' && *p <= 'F') 269 n = 16 * n + *p - 'A' + 10; 270 } 271 *pp = (char *) p; 272 return n; 273} 274 275#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 276 277int quoted(char **pp) /* pick up next thing after a \\ */ 278 /* and increment *pp */ 279{ 280 char *p = *pp; 281 int c; 282 283 if ((c = *p++) == 't') 284 c = '\t'; 285 else if (c == 'n') 286 c = '\n'; 287 else if (c == 'f') 288 c = '\f'; 289 else if (c == 'r') 290 c = '\r'; 291 else if (c == 'b') 292 c = '\b'; 293 else if (c == '\\') 294 c = '\\'; 295 else if (c == 'x') { /* hexadecimal goo follows */ 296 c = hexstr(&p); /* this adds a null if number is invalid */ 297 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 298 int n = c - '0'; 299 if (isoctdigit(*p)) { 300 n = 8 * n + *p++ - '0'; 301 if (isoctdigit(*p)) 302 n = 8 * n + *p++ - '0'; 303 } 304 c = n; 305 } /* else */ 306 /* c = c; */ 307 *pp = p; 308 return c; 309} 310 311char *cclenter(const char *argp) /* add a character class */ 312{ 313 int i, c, c2; 314 uschar *p = (uschar *) argp; 315 uschar *op, *bp; 316 static uschar *buf = 0; 317 static int bufsz = 100; 318 319 op = p; 320 if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 321 FATAL("out of space for character class [%.10s...] 1", p); 322 bp = buf; 323 for (i = 0; (c = *p++) != 0; ) { 324 if (c == '\\') { 325 c = quoted((char **) &p); 326 } else if (c == '-' && i > 0 && bp[-1] != 0) { 327 if (*p != 0) { 328 c = bp[-1]; 329 c2 = *p++; 330 if (c2 == '\\') 331 c2 = quoted((char **) &p); 332 if (c > c2) { /* empty; ignore */ 333 bp--; 334 i--; 335 continue; 336 } 337 while (c < c2) { 338 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 339 FATAL("out of space for character class [%.10s...] 2", p); 340 *bp++ = ++c; 341 i++; 342 } 343 continue; 344 } 345 } 346 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 347 FATAL("out of space for character class [%.10s...] 3", p); 348 *bp++ = c; 349 i++; 350 } 351 *bp = 0; 352 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); 353 xfree(op); 354 return (char *) tostring((char *) buf); 355} 356 357void overflo(const char *s) 358{ 359 FATAL("regular expression too big: %.30s...", s); 360} 361 362void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 363{ 364 int i; 365 int *p; 366 367 switch (type(v)) { 368 ELEAF 369 LEAF 370 f->re[info(v)].ltype = type(v); 371 f->re[info(v)].lval.np = right(v); 372 while (f->accept >= maxsetvec) { /* guessing here! */ 373 maxsetvec *= 4; 374 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 375 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 376 if (setvec == 0 || tmpset == 0) 377 overflo("out of space in cfoll()"); 378 } 379 for (i = 0; i <= f->accept; i++) 380 setvec[i] = 0; 381 setcnt = 0; 382 follow(v); /* computes setvec and setcnt */ 383 if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 384 overflo("out of space building follow set"); 385 f->re[info(v)].lfollow = p; 386 *p = setcnt; 387 for (i = f->accept; i >= 0; i--) 388 if (setvec[i] == 1) 389 *++p = i; 390 break; 391 UNARY 392 cfoll(f,left(v)); 393 break; 394 case CAT: 395 case OR: 396 cfoll(f,left(v)); 397 cfoll(f,right(v)); 398 break; 399 default: /* can't happen */ 400 FATAL("can't happen: unknown type %d in cfoll", type(v)); 401 } 402} 403 404int first(Node *p) /* collects initially active leaves of p into setvec */ 405 /* returns 0 if p matches empty string */ 406{ 407 int b, lp; 408 409 switch (type(p)) { 410 ELEAF 411 LEAF 412 lp = info(p); /* look for high-water mark of subscripts */ 413 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 414 maxsetvec *= 4; 415 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 416 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 417 if (setvec == 0 || tmpset == 0) 418 overflo("out of space in first()"); 419 } 420 if (type(p) == EMPTYRE) { 421 setvec[lp] = 0; 422 return(0); 423 } 424 if (setvec[lp] != 1) { 425 setvec[lp] = 1; 426 setcnt++; 427 } 428 if (type(p) == CCL && (*(char *) right(p)) == '\0') 429 return(0); /* empty CCL */ 430 else return(1); 431 case PLUS: 432 if (first(left(p)) == 0) return(0); 433 return(1); 434 case STAR: 435 case QUEST: 436 first(left(p)); 437 return(0); 438 case CAT: 439 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 440 return(1); 441 case OR: 442 b = first(right(p)); 443 if (first(left(p)) == 0 || b == 0) return(0); 444 return(1); 445 } 446 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 447 return(-1); 448} 449 450void follow(Node *v) /* collects leaves that can follow v into setvec */ 451{ 452 Node *p; 453 454 if (type(v) == FINAL) 455 return; 456 p = parent(v); 457 switch (type(p)) { 458 case STAR: 459 case PLUS: 460 first(v); 461 follow(p); 462 return; 463 464 case OR: 465 case QUEST: 466 follow(p); 467 return; 468 469 case CAT: 470 if (v == left(p)) { /* v is left child of p */ 471 if (first(right(p)) == 0) { 472 follow(p); 473 return; 474 } 475 } else /* v is right child */ 476 follow(p); 477 return; 478 } 479} 480 481int member(int c, const char *sarg) /* is c in s? */ 482{ 483 uschar *s = (uschar *) sarg; 484 485 while (*s) 486 if (c == *s++) 487 return(1); 488 return(0); 489} 490 491int match(fa *f, const char *p0) /* shortest match ? */ 492{ 493 int s, ns; 494 uschar *p = (uschar *) p0; 495 496 s = f->reset ? makeinit(f,0) : f->initstat; 497 if (f->out[s]) 498 return(1); 499 do { 500 /* assert(*p < NCHARS); */ 501 if ((ns = f->gototab[s][*p]) != 0) 502 s = ns; 503 else 504 s = cgoto(f, s, *p); 505 if (f->out[s]) 506 return(1); 507 } while (*p++ != 0); 508 return(0); 509} 510 511int pmatch(fa *f, const char *p0) /* longest match, for sub */ 512{ 513 int s, ns; 514 uschar *p = (uschar *) p0; 515 uschar *q; 516 int i, k; 517 518 /* s = f->reset ? makeinit(f,1) : f->initstat; */ 519 if (f->reset) { 520 f->initstat = s = makeinit(f,1); 521 } else { 522 s = f->initstat; 523 } 524 patbeg = (char *) p; 525 patlen = -1; 526 do { 527 q = p; 528 do { 529 if (f->out[s]) /* final state */ 530 patlen = q-p; 531 /* assert(*q < NCHARS); */ 532 if ((ns = f->gototab[s][*q]) != 0) 533 s = ns; 534 else 535 s = cgoto(f, s, *q); 536 if (s == 1) { /* no transition */ 537 if (patlen >= 0) { 538 patbeg = (char *) p; 539 return(1); 540 } 541 else 542 goto nextin; /* no match */ 543 } 544 } while (*q++ != 0); 545 if (f->out[s]) 546 patlen = q-p-1; /* don't count $ */ 547 if (patlen >= 0) { 548 patbeg = (char *) p; 549 return(1); 550 } 551 nextin: 552 s = 2; 553 if (f->reset) { 554 for (i = 2; i <= f->curstat; i++) 555 xfree(f->posns[i]); 556 k = *f->posns[0]; 557 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 558 overflo("out of space in pmatch"); 559 for (i = 0; i <= k; i++) 560 (f->posns[2])[i] = (f->posns[0])[i]; 561 f->initstat = f->curstat = 2; 562 f->out[2] = f->out[0]; 563 for (i = 0; i < NCHARS; i++) 564 f->gototab[2][i] = 0; 565 } 566 } while (*p++ != 0); 567 return (0); 568} 569 570int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 571{ 572 int s, ns; 573 uschar *p = (uschar *) p0; 574 uschar *q; 575 int i, k; 576 577 /* s = f->reset ? makeinit(f,1) : f->initstat; */ 578 if (f->reset) { 579 f->initstat = s = makeinit(f,1); 580 } else { 581 s = f->initstat; 582 } 583 patlen = -1; 584 while (*p) { 585 q = p; 586 do { 587 if (f->out[s]) /* final state */ 588 patlen = q-p; 589 /* assert(*q < NCHARS); */ 590 if ((ns = f->gototab[s][*q]) != 0) 591 s = ns; 592 else 593 s = cgoto(f, s, *q); 594 if (s == 1) { /* no transition */ 595 if (patlen > 0) { 596 patbeg = (char *) p; 597 return(1); 598 } else 599 goto nnextin; /* no nonempty match */ 600 } 601 } while (*q++ != 0); 602 if (f->out[s]) 603 patlen = q-p-1; /* don't count $ */ 604 if (patlen > 0 ) { 605 patbeg = (char *) p; 606 return(1); 607 } 608 nnextin: 609 s = 2; 610 if (f->reset) { 611 for (i = 2; i <= f->curstat; i++) 612 xfree(f->posns[i]); 613 k = *f->posns[0]; 614 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 615 overflo("out of state space"); 616 for (i = 0; i <= k; i++) 617 (f->posns[2])[i] = (f->posns[0])[i]; 618 f->initstat = f->curstat = 2; 619 f->out[2] = f->out[0]; 620 for (i = 0; i < NCHARS; i++) 621 f->gototab[2][i] = 0; 622 } 623 p++; 624 } 625 return (0); 626} 627 628Node *reparse(const char *p) /* parses regular expression pointed to by p */ 629{ /* uses relex() to scan regular expression */ 630 Node *np; 631 632 dprintf( ("reparse <%s>\n", p) ); 633 lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */ 634 rtok = relex(); 635 /* GNU compatibility: an empty regexp matches anything */ 636 if (rtok == '\0') { 637 /* FATAL("empty regular expression"); previous */ 638 return(op2(EMPTYRE, NIL, NIL)); 639 } 640 np = regexp(); 641 if (rtok != '\0') 642 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 643 return(np); 644} 645 646Node *regexp(void) /* top-level parse of reg expr */ 647{ 648 return (alt(concat(primary()))); 649} 650 651Node *primary(void) 652{ 653 Node *np; 654 int savelastatom; 655 656 switch (rtok) { 657 case CHAR: 658 lastatom = starttok; 659 np = op2(CHAR, NIL, itonp(rlxval)); 660 rtok = relex(); 661 return (unary(np)); 662 case ALL: 663 rtok = relex(); 664 return (unary(op2(ALL, NIL, NIL))); 665 case EMPTYRE: 666 if (replogfile) { 667 fprintf(replogfile, 668 "returned EMPTYRE from primary\n"); 669 fflush(replogfile); 670 } 671 rtok = relex(); 672 673 return (unary(op2(EMPTYRE, NIL, NIL))); 674 case DOT: 675 lastatom = starttok; 676 rtok = relex(); 677 return (unary(op2(DOT, NIL, NIL))); 678 case CCL: 679 lastatom = starttok; 680 np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); 681 rtok = relex(); 682 return (unary(np)); 683 case NCCL: 684 lastatom = starttok; 685 np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); 686 rtok = relex(); 687 return (unary(np)); 688 case '^': 689 rtok = relex(); 690 return (unary(op2(CHAR, NIL, itonp(HAT)))); 691 case '$': 692 rtok = relex(); 693 return (unary(op2(CHAR, NIL, NIL))); 694 case '(': 695 lastatom = starttok; 696 savelastatom = (char *)starttok-basestr; /* Retain over recursion */ 697 rtok = relex(); 698 if (rtok == ')') { /* special pleading for () */ 699 rtok = relex(); 700 return unary(op2(CCL, NIL, (Node *) tostring(""))); 701 } 702 np = regexp(); 703 if (rtok == ')') { 704 lastatom = basestr+savelastatom; /* Restore */ 705 rtok = relex(); 706 return (unary(np)); 707 } 708 else 709 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 710 default: 711 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 712 } 713 return 0; /*NOTREACHED*/ 714} 715 716Node *concat(Node *np) 717{ 718 switch (rtok) { 719 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 720 return (concat(op2(CAT, np, primary()))); 721 case EMPTYRE: 722 if (replogfile) { 723 fprintf(replogfile, 724 "returned EMPTYRE to concat\n"); 725 fflush(replogfile); 726 } 727 rtok = relex(); 728 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 729 primary()))); 730 } 731 return (np); 732} 733 734Node *alt(Node *np) 735{ 736 if (rtok == OR) { 737 rtok = relex(); 738 return (alt(op2(OR, np, concat(primary())))); 739 } 740 return (np); 741} 742 743Node *unary(Node *np) 744{ 745 switch (rtok) { 746 case STAR: 747 rtok = relex(); 748 return (unary(op2(STAR, np, NIL))); 749 case PLUS: 750 rtok = relex(); 751 return (unary(op2(PLUS, np, NIL))); 752 case QUEST: 753 rtok = relex(); 754 return (unary(op2(QUEST, np, NIL))); 755 default: 756 return (np); 757 } 758} 759 760/* 761 * Character class definitions conformant to the POSIX locale as 762 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 763 * and operating character sets are both ASCII (ISO646) or supersets 764 * thereof. 765 * 766 * Note that to avoid overflowing the temporary buffer used in 767 * relex(), the expanded character class (prior to range expansion) 768 * must be less than twice the size of their full name. 769 */ 770 771/* Because isblank doesn't show up in any of the header files on any 772 * system i use, it's defined here. if some other locale has a richer 773 * definition of "blank", define HAS_ISBLANK and provide your own 774 * version. 775 * the parentheses here are an attempt to find a path through the maze 776 * of macro definition and/or function and/or version provided. thanks 777 * to nelson beebe for the suggestion; let's see if it works everywhere. 778 */ 779 780#if defined(__APPLE__) 781#define HAS_ISBLANK 782#endif 783#ifndef HAS_ISBLANK 784 785int (isblank)(int c) 786{ 787 return c==' ' || c=='\t'; 788} 789 790#endif 791 792struct charclass { 793 const char *cc_name; 794 int cc_namelen; 795 int (*cc_func)(int); 796} charclasses[] = { 797 { "alnum", 5, isalnum }, 798 { "alpha", 5, isalpha }, 799 { "blank", 5, isblank }, 800 { "cntrl", 5, iscntrl }, 801 { "digit", 5, isdigit }, 802 { "graph", 5, isgraph }, 803 { "lower", 5, islower }, 804 { "print", 5, isprint }, 805 { "punct", 5, ispunct }, 806 { "space", 5, isspace }, 807 { "upper", 5, isupper }, 808 { "xdigit", 6, isxdigit }, 809 { NULL, 0, NULL }, 810}; 811 812#define REPEAT_SIMPLE 0 813#define REPEAT_PLUS_APPENDED 1 814#define REPEAT_WITH_Q 2 815#define REPEAT_ZERO 3 816 817int replace_repeat(uschar * reptok, int reptoklen, uschar * atom, int atomlen, 818 int firstnum, int secondnum, int special_case) 819{ 820 int i, j; 821 uschar *buf = 0; 822 int ret = 1; 823 int init_q = (firstnum==0); /* first added char will be ? */ 824 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 825 int prefix_length = (char *) reptok-basestr; /* prefix includes first rep */ 826 int suffix_length = strlen(reptok) - reptoklen; /* string after rep specifier */ 827 int size = prefix_length + suffix_length; 828 829 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 830 size += atomlen*(firstnum-1); 831 } 832 833 /* Adjust size of buffer for special cases */ 834 if (special_case == REPEAT_PLUS_APPENDED) { 835 size++; /* for the final + */ 836 } else if (special_case == REPEAT_WITH_Q) { 837 size += init_q + (atomlen+1)* n_q_reps; 838 } else if (special_case == REPEAT_ZERO) { 839 size += 2; /* just a null ERE: () */ 840 } 841 if ((buf = (uschar *) malloc(size+1)) == NULL) 842 FATAL("out of space in reg expr %.10s..", lastre); 843 if (replogfile) { 844 fprintf(replogfile, "re before: len=%d,%s\n" 845 " : init_q=%d,n_q_reps=%d\n", 846 strlen(basestr),basestr, 847 init_q,n_q_reps); 848 fprintf(replogfile, "re prefix_length=%d,atomlen=%d\n", 849 prefix_length,atomlen); 850/* 851 fprintf(replogfile, " new buf size: %d, atom=%s, atomlen=%d\n", 852 size, atom, atomlen); 853*/ 854 fflush(replogfile); 855 } 856 memcpy(buf, basestr, prefix_length); /* copy prefix */ 857 j = prefix_length; 858 if (special_case == REPEAT_ZERO) { 859 j -= atomlen; 860 buf[j++] = '('; 861 buf[j++] = ')'; 862 } 863 for (i=1; i < firstnum; i++) { /* copy x reps */ 864 memcpy(&buf[j], atom, atomlen); 865 j += atomlen; 866 } 867 if (special_case == REPEAT_PLUS_APPENDED) { 868 buf[j++] = '+'; 869 } else if (special_case == REPEAT_WITH_Q) { 870 if (init_q) buf[j++] = '?'; 871 for (i=0; i < n_q_reps; i++) { /* copy x? reps */ 872 memcpy(&buf[j], atom, atomlen); 873 j += atomlen; 874 buf[j++] = '?'; 875 } 876 } 877 memcpy(&buf[j], reptok+reptoklen, suffix_length); 878 if (special_case == REPEAT_ZERO) { 879 buf[j+suffix_length] = '\0'; 880 } else { 881 buf[size] = '\0'; 882 } 883 if (replogfile) { 884 fprintf(replogfile, "re after : len=%d,%s\n",strlen(buf),buf); 885 fflush(replogfile); 886 } 887 /* free old basestr */ 888 if (firstbasestr != basestr) { 889 if (basestr) free(basestr); 890 } 891 basestr = (char *)buf; 892 prestr = buf + prefix_length; 893 if (special_case == REPEAT_ZERO) { 894 prestr -= atomlen; 895 ret++; 896 } 897 return ret; 898} 899 900int repeat(uschar * reptok, int reptoklen, uschar * atom, int atomlen, 901 int firstnum, int secondnum) 902{ 903 /* 904 In general, the repetition specifier or "bound" is replaced here 905 by an equivalent ERE string, repeating the immediately previous atom 906 and appending ? and + as needed. Note that the first copy of the 907 atom is left in place, except in the special_case of a zero-repeat 908 (i.e., {0}). 909 */ 910 int i, j; 911 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 912 if (firstnum < 2) { 913 /* 0 or 1: should be handled before you get here */ 914 if (replogfile) { 915 fprintf(replogfile, 916 "{%d, %d}, shouldn't be here\n", 917 firstnum, secondnum); 918 fflush(replogfile); 919 } 920 } else { 921 return replace_repeat(reptok, reptoklen, atom, atomlen, 922 firstnum, secondnum, REPEAT_PLUS_APPENDED); 923 } 924 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 925 if (firstnum == 0) { /* {0} or {0,0} */ 926 /* This case is unusual because the resulting 927 replacement string might actually be SMALLER than 928 the original ERE */ 929 return replace_repeat(reptok, reptoklen, atom, atomlen, 930 firstnum, secondnum, REPEAT_ZERO); 931 } else { /* (firstnum >= 1) */ 932 return replace_repeat(reptok, reptoklen, atom, atomlen, 933 firstnum, secondnum, REPEAT_SIMPLE); 934 } 935 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 936 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 937 return replace_repeat(reptok, reptoklen, atom, atomlen, 938 firstnum, secondnum, REPEAT_WITH_Q); 939 } else { /* Error - shouldn't be here (n>m) */ 940 if (replogfile) { 941 fprintf(replogfile, 942 "illegal ERE {%d,%d} shouldn't be here!\n", 943 firstnum,secondnum); 944 fflush(replogfile); 945 } 946 } 947 return 0; 948} 949 950int relex(void) /* lexical analyzer for reparse */ 951{ 952 int c, n; 953 int cflag; 954 static uschar *buf = 0; 955 static int bufsz = 100; 956 uschar *bp; 957 struct charclass *cc; 958 int i; 959 int num, m, commafound, digitfound; 960 uschar *startreptok; 961 962rescan: 963 starttok = prestr; 964 965 switch (c = *prestr++) { 966 case '|': return OR; 967 case '*': return STAR; 968 case '+': return PLUS; 969 case '?': return QUEST; 970 case '.': return DOT; 971 case '\0': prestr--; return '\0'; 972 case '^': 973 case '$': 974 case '(': 975 case ')': 976 return c; 977 case '\\': 978 rlxval = quoted((char **) &prestr); 979 return CHAR; 980 default: 981 rlxval = c; 982 return CHAR; 983 case '[': 984 if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 985 FATAL("out of space in reg expr %.10s..", lastre); 986 bp = buf; 987 if (*prestr == '^') { 988 cflag = 1; 989 prestr++; 990 } 991 else 992 cflag = 0; 993 n = 2 * strlen((const char *) prestr)+1; 994 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 995 FATAL("out of space for reg expr %.10s...", lastre); 996 for (; ; ) { 997 if ((c = *prestr++) == '\\') { 998 *bp++ = '\\'; 999 if ((c = *prestr++) == '\0') 1000 FATAL("nonterminated character class %.20s...", lastre); 1001 *bp++ = c; 1002 /* } else if (c == '\n') { */ 1003 /* FATAL("newline in character class %.20s...", lastre); */ 1004 } else if (c == '[' && *prestr == ':') { 1005 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1006 for (cc = charclasses; cc->cc_name; cc++) 1007 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1008 break; 1009 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1010 prestr[2 + cc->cc_namelen] == ']') { 1011 prestr += cc->cc_namelen + 3; 1012 for (i = 0; i < NCHARS; i++) { 1013 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 1014 FATAL("out of space for reg expr %.10s...", lastre); 1015 if (cc->cc_func(i)) { 1016 *bp++ = i; 1017 n++; 1018 } 1019 } 1020 } else 1021 *bp++ = c; 1022 } else if (Unix2003_compat && c == '[' 1023 && *prestr == '.') { 1024 char collate_char; 1025 prestr++; 1026 collate_char = *prestr++; 1027 if (*prestr == '.' && prestr[1] == ']') { 1028 prestr += 2; 1029 /* Found it: map via locale TBD: for 1030 now, simply return this char. This 1031 is sufficient to pass conformance 1032 test awk.ex 156 1033 */ 1034 if (*prestr == ']') { 1035 prestr++; 1036 rlxval = collate_char; 1037 if (replogfile) { 1038 fprintf(replogfile, 1039 "[..] collate char=%c\n", 1040 collate_char); 1041 fflush(replogfile); 1042 } 1043 return CHAR; 1044 } 1045 } 1046 } else if (Unix2003_compat && c == '[' 1047 && *prestr == '=') { 1048 char equiv_char; 1049 prestr++; 1050 equiv_char = *prestr++; 1051 if (*prestr == '=' && prestr[1] == ']') { 1052 prestr += 2; 1053 /* Found it: map via locale TBD: for now 1054 simply return this char. This is 1055 sufficient to pass conformance test 1056 awk.ex 156 1057 */ 1058 if (*prestr == ']') { 1059 prestr++; 1060 rlxval = equiv_char; 1061 if (replogfile) { 1062 fprintf(replogfile, 1063 "[==] equiv char=%c\n", 1064 equiv_char); 1065 fflush(replogfile); 1066 } 1067 return CHAR; 1068 } 1069 } 1070 } else if (c == '\0') { 1071 FATAL("nonterminated character class %.20s", lastre); 1072 } else if (bp == buf) { /* 1st char is special */ 1073 *bp++ = c; 1074 } else if (c == ']') { 1075 *bp++ = 0; 1076 rlxstr = (uschar *) tostring((char *) buf); 1077 if (replogfile) { 1078 fprintf(replogfile, 1079 "detecting []: cflag=%d, len=%d,%s\n", 1080 cflag,strlen(rlxstr),rlxstr); 1081 fflush(replogfile); 1082 } 1083 if (cflag == 0) 1084 return CCL; 1085 else 1086 return NCCL; 1087 } else 1088 *bp++ = c; 1089 } 1090 break; 1091 case '{': 1092 if (Unix2003_compat && isdigit(*(prestr))) { 1093 num = 0; /* Process as a repetition */ 1094 n = -1; m = -1; 1095 commafound = 0; 1096 digitfound = 0; 1097 startreptok = prestr-1; 1098 /* Remember start of previous atom here ? */ 1099 } else { /* just a { char, not a repetition */ 1100 rlxval = c; 1101 return CHAR; 1102 } 1103 for (; ; ) { 1104 if ((c = *prestr++) == '}') { 1105 if (commafound) { 1106 if (digitfound) { /* {n,m} */ 1107 m = num; 1108 if (m<n) 1109 FATAL("illegal repetition expression: class %.20s", 1110 lastre); 1111 if ((n==0) && (m==1)) { 1112 return QUEST; 1113 } 1114 } else { /* {n,} */ 1115 if (n==0) return STAR; 1116 if (n==1) return PLUS; 1117 } 1118 } else { 1119 if (digitfound) { /* {n} same as {n,n} */ 1120 n = num; 1121 m = num; 1122 } else { /* {} */ 1123 FATAL("illegal repetition expression: class %.20s", 1124 lastre); 1125 } 1126 } 1127 if (repeat(starttok, prestr-starttok, lastatom, 1128 startreptok - lastatom, n, m) > 0) { 1129 if ((n==0) && (m==0)) { 1130 return EMPTYRE; 1131 } 1132 /* must rescan input for next token */ 1133 goto rescan; 1134 } 1135 /* Failed to replace: eat up {...} characters 1136 and treat like just PLUS */ 1137 return PLUS; 1138 } else if (c == '\0') { 1139 FATAL("nonterminated character class %.20s", 1140 lastre); 1141 } else if (isdigit(c)) { 1142 num = 10 * num + c - '0'; 1143 digitfound = 1; 1144 } else if (c == ',') { 1145 if (commafound) 1146 FATAL("illegal repetition expression: class %.20s", 1147 lastre); 1148 /* looking for {n,} or {n,m} */ 1149 commafound = 1; 1150 n = num; 1151 digitfound = 0; /* reset */ 1152 num = 0; 1153 } else { 1154 FATAL("illegal repetition expression: class %.20s", 1155 lastre); 1156 } 1157 } 1158 break; 1159 } 1160} 1161 1162int cgoto(fa *f, int s, int c) 1163{ 1164 int i, j, k; 1165 int *p, *q; 1166 1167 assert(c == HAT || c < NCHARS); 1168 while (f->accept >= maxsetvec) { /* guessing here! */ 1169 maxsetvec *= 4; 1170 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 1171 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 1172 if (setvec == 0 || tmpset == 0) 1173 overflo("out of space in cgoto()"); 1174 } 1175 for (i = 0; i <= f->accept; i++) 1176 setvec[i] = 0; 1177 setcnt = 0; 1178 /* compute positions of gototab[s,c] into setvec */ 1179 p = f->posns[s]; 1180 for (i = 1; i <= *p; i++) { 1181 if ((k = f->re[p[i]].ltype) != FINAL) { 1182 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1183 || (k == DOT && c != 0 && c != HAT) 1184 || (k == ALL && c != 0) 1185 || (k == EMPTYRE && c != 0) 1186 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 1187 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 1188 q = f->re[p[i]].lfollow; 1189 for (j = 1; j <= *q; j++) { 1190 if (q[j] >= maxsetvec) { 1191 maxsetvec *= 4; 1192 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 1193 tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int)); 1194 if (setvec == 0 || tmpset == 0) 1195 overflo("cgoto overflow"); 1196 } 1197 if (setvec[q[j]] == 0) { 1198 setcnt++; 1199 setvec[q[j]] = 1; 1200 } 1201 } 1202 } 1203 } 1204 } 1205 /* determine if setvec is a previous state */ 1206 tmpset[0] = setcnt; 1207 j = 1; 1208 for (i = f->accept; i >= 0; i--) 1209 if (setvec[i]) { 1210 tmpset[j++] = i; 1211 } 1212 /* tmpset == previous state? */ 1213 for (i = 1; i <= f->curstat; i++) { 1214 p = f->posns[i]; 1215 if ((k = tmpset[0]) != p[0]) 1216 goto different; 1217 for (j = 1; j <= k; j++) 1218 if (tmpset[j] != p[j]) 1219 goto different; 1220 /* setvec is state i */ 1221 f->gototab[s][c] = i; 1222 return i; 1223 different:; 1224 } 1225 1226 /* add tmpset to current set of states */ 1227 if (f->curstat >= NSTATES-1) { 1228 f->curstat = 2; 1229 f->reset = 1; 1230 for (i = 2; i < NSTATES; i++) 1231 xfree(f->posns[i]); 1232 } else 1233 ++(f->curstat); 1234 for (i = 0; i < NCHARS; i++) 1235 f->gototab[f->curstat][i] = 0; 1236 xfree(f->posns[f->curstat]); 1237 if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 1238 overflo("out of space in cgoto"); 1239 1240 f->posns[f->curstat] = p; 1241 f->gototab[s][c] = f->curstat; 1242 for (i = 0; i <= setcnt; i++) 1243 p[i] = tmpset[i]; 1244 if (setvec[f->accept]) 1245 f->out[f->curstat] = 1; 1246 else 1247 f->out[f->curstat] = 0; 1248 return f->curstat; 1249} 1250 1251 1252void freefa(fa *f) /* free a finite automaton */ 1253{ 1254 int i; 1255 1256 if (f == NULL) 1257 return; 1258 for (i = 0; i <= f->curstat; i++) 1259 xfree(f->posns[i]); 1260 for (i = 0; i <= f->accept; i++) { 1261 xfree(f->re[i].lfollow); 1262 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1263 xfree((f->re[i].lval.np)); 1264 } 1265 xfree(f->restr); 1266 xfree(f); 1267} 1268