1/* 2 * lexical analyzer 3 * This file is #included by regcomp.c. 4 * 5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 6 * 7 * Development of this software was funded, in part, by Cray Research Inc., 8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 9 * Corporation, none of whom are responsible for the results. The author 10 * thanks all of them. 11 * 12 * Redistribution and use in source and binary forms -- with or without 13 * modification -- are permitted for any purpose, provided that 14 * redistributions in source form retain this entire copyright notice and 15 * indicate the origin and nature of any modifications. 16 * 17 * I'd appreciate being given credit for this package in the documentation of 18 * software which uses it, but that is not a requirement. 19 * 20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32/* scanning macros (know about v) */ 33#define ATEOS() (v->now >= v->stop) 34#define HAVE(n) (v->stop - v->now >= (n)) 35#define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) 36#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) 37#define NEXT3(a,b,c) \ 38 (HAVE(3) && *v->now == CHR(a) && \ 39 *(v->now+1) == CHR(b) && \ 40 *(v->now+2) == CHR(c)) 41#define SET(c) (v->nexttype = (c)) 42#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) 43#define RET(c) return (SET(c), 1) 44#define RETV(c, n) return (SETV(c, n), 1) 45#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ 46#define LASTTYPE(t) (v->lasttype == (t)) 47 48/* lexical contexts */ 49#define L_ERE 1 /* mainline ERE/ARE */ 50#define L_BRE 2 /* mainline BRE */ 51#define L_Q 3 /* REG_QUOTE */ 52#define L_EBND 4 /* ERE/ARE bound */ 53#define L_BBND 5 /* BRE bound */ 54#define L_BRACK 6 /* brackets */ 55#define L_CEL 7 /* collating element */ 56#define L_ECL 8 /* equivalence class */ 57#define L_CCL 9 /* character class */ 58#define INTOCON(c) (v->lexcon = (c)) 59#define INCON(con) (v->lexcon == (con)) 60 61/* construct pointer past end of chr array */ 62#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) 63 64/* 65 - lexstart - set up lexical stuff, scan leading options 66 ^ static VOID lexstart(struct vars *); 67 */ 68static void 69lexstart( 70 struct vars *v) 71{ 72 prefixes(v); /* may turn on new type bits etc. */ 73 NOERR(); 74 75 if (v->cflags®_QUOTE) { 76 assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))); 77 INTOCON(L_Q); 78 } else if (v->cflags®_EXTENDED) { 79 assert(!(v->cflags®_QUOTE)); 80 INTOCON(L_ERE); 81 } else { 82 assert(!(v->cflags&(REG_QUOTE|REG_ADVF))); 83 INTOCON(L_BRE); 84 } 85 86 v->nexttype = EMPTY; /* remember we were at the start */ 87 next(v); /* set up the first token */ 88} 89 90/* 91 - prefixes - implement various special prefixes 92 ^ static VOID prefixes(struct vars *); 93 */ 94static void 95prefixes( 96 struct vars *v) 97{ 98 /* 99 * Literal string doesn't get any of this stuff. 100 */ 101 102 if (v->cflags®_QUOTE) { 103 return; 104 } 105 106 /* 107 * Initial "***" gets special things. 108 */ 109 110 if (HAVE(4) && NEXT3('*', '*', '*')) { 111 switch (*(v->now + 3)) { 112 case CHR('?'): /* "***?" error, msg shows version */ 113 ERR(REG_BADPAT); 114 return; /* proceed no further */ 115 break; 116 case CHR('='): /* "***=" shifts to literal string */ 117 NOTE(REG_UNONPOSIX); 118 v->cflags |= REG_QUOTE; 119 v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE); 120 v->now += 4; 121 return; /* and there can be no more prefixes */ 122 break; 123 case CHR(':'): /* "***:" shifts to AREs */ 124 NOTE(REG_UNONPOSIX); 125 v->cflags |= REG_ADVANCED; 126 v->now += 4; 127 break; 128 default: /* otherwise *** is just an error */ 129 ERR(REG_BADRPT); 130 return; 131 break; 132 } 133 } 134 135 /* 136 * BREs and EREs don't get embedded options. 137 */ 138 139 if ((v->cflags®_ADVANCED) != REG_ADVANCED) { 140 return; 141 } 142 143 /* 144 * Embedded options (AREs only). 145 */ 146 147 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) { 148 NOTE(REG_UNONPOSIX); 149 v->now += 2; 150 for (; !ATEOS() && iscalpha(*v->now); v->now++) { 151 switch (*v->now) { 152 case CHR('b'): /* BREs (but why???) */ 153 v->cflags &= ~(REG_ADVANCED|REG_QUOTE); 154 break; 155 case CHR('c'): /* case sensitive */ 156 v->cflags &= ~REG_ICASE; 157 break; 158 case CHR('e'): /* plain EREs */ 159 v->cflags |= REG_EXTENDED; 160 v->cflags &= ~(REG_ADVF|REG_QUOTE); 161 break; 162 case CHR('i'): /* case insensitive */ 163 v->cflags |= REG_ICASE; 164 break; 165 case CHR('m'): /* Perloid synonym for n */ 166 case CHR('n'): /* \n affects ^ $ . [^ */ 167 v->cflags |= REG_NEWLINE; 168 break; 169 case CHR('p'): /* ~Perl, \n affects . [^ */ 170 v->cflags |= REG_NLSTOP; 171 v->cflags &= ~REG_NLANCH; 172 break; 173 case CHR('q'): /* literal string */ 174 v->cflags |= REG_QUOTE; 175 v->cflags &= ~REG_ADVANCED; 176 break; 177 case CHR('s'): /* single line, \n ordinary */ 178 v->cflags &= ~REG_NEWLINE; 179 break; 180 case CHR('t'): /* tight syntax */ 181 v->cflags &= ~REG_EXPANDED; 182 break; 183 case CHR('w'): /* weird, \n affects ^ $ only */ 184 v->cflags &= ~REG_NLSTOP; 185 v->cflags |= REG_NLANCH; 186 break; 187 case CHR('x'): /* expanded syntax */ 188 v->cflags |= REG_EXPANDED; 189 break; 190 default: 191 ERR(REG_BADOPT); 192 return; 193 } 194 } 195 if (!NEXT1(')')) { 196 ERR(REG_BADOPT); 197 return; 198 } 199 v->now++; 200 if (v->cflags®_QUOTE) { 201 v->cflags &= ~(REG_EXPANDED|REG_NEWLINE); 202 } 203 } 204} 205 206/* 207 - lexnest - "call a subroutine", interpolating string at the lexical level 208 * Note, this is not a very general facility. There are a number of 209 * implicit assumptions about what sorts of strings can be subroutines. 210 ^ static VOID lexnest(struct vars *, const chr *, const chr *); 211 */ 212static void 213lexnest( 214 struct vars *v, 215 const chr *beginp, /* start of interpolation */ 216 const chr *endp) /* one past end of interpolation */ 217{ 218 assert(v->savenow == NULL); /* only one level of nesting */ 219 v->savenow = v->now; 220 v->savestop = v->stop; 221 v->now = beginp; 222 v->stop = endp; 223} 224 225/* 226 * string constants to interpolate as expansions of things like \d 227 */ 228 229static const chr backd[] = { /* \d */ 230 CHR('['), CHR('['), CHR(':'), 231 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 232 CHR(':'), CHR(']'), CHR(']') 233}; 234static const chr backD[] = { /* \D */ 235 CHR('['), CHR('^'), CHR('['), CHR(':'), 236 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 237 CHR(':'), CHR(']'), CHR(']') 238}; 239static const chr brbackd[] = { /* \d within brackets */ 240 CHR('['), CHR(':'), 241 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 242 CHR(':'), CHR(']') 243}; 244static const chr backs[] = { /* \s */ 245 CHR('['), CHR('['), CHR(':'), 246 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 247 CHR(':'), CHR(']'), CHR(']') 248}; 249static const chr backS[] = { /* \S */ 250 CHR('['), CHR('^'), CHR('['), CHR(':'), 251 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 252 CHR(':'), CHR(']'), CHR(']') 253}; 254static const chr brbacks[] = { /* \s within brackets */ 255 CHR('['), CHR(':'), 256 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 257 CHR(':'), CHR(']') 258}; 259static const chr backw[] = { /* \w */ 260 CHR('['), CHR('['), CHR(':'), 261 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 262 CHR(':'), CHR(']'), CHR('_'), CHR(']') 263}; 264static const chr backW[] = { /* \W */ 265 CHR('['), CHR('^'), CHR('['), CHR(':'), 266 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 267 CHR(':'), CHR(']'), CHR('_'), CHR(']') 268}; 269static const chr brbackw[] = { /* \w within brackets */ 270 CHR('['), CHR(':'), 271 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 272 CHR(':'), CHR(']'), CHR('_') 273}; 274 275/* 276 - lexword - interpolate a bracket expression for word characters 277 * Possibly ought to inquire whether there is a "word" character class. 278 ^ static VOID lexword(struct vars *); 279 */ 280static void 281lexword( 282 struct vars *v) 283{ 284 lexnest(v, backw, ENDOF(backw)); 285} 286 287/* 288 - next - get next token 289 ^ static int next(struct vars *); 290 */ 291static int /* 1 normal, 0 failure */ 292next( 293 struct vars *v) 294{ 295 chr c; 296 297 /* 298 * Errors yield an infinite sequence of failures. 299 */ 300 301 if (ISERR()) { 302 return 0; /* the error has set nexttype to EOS */ 303 } 304 305 /* 306 * Remember flavor of last token. 307 */ 308 309 v->lasttype = v->nexttype; 310 311 /* 312 * REG_BOSONLY 313 */ 314 315 if (v->nexttype == EMPTY && (v->cflags®_BOSONLY)) { 316 /* at start of a REG_BOSONLY RE */ 317 RETV(SBEGIN, 0); /* same as \A */ 318 } 319 320 /* 321 * If we're nested and we've hit end, return to outer level. 322 */ 323 324 if (v->savenow != NULL && ATEOS()) { 325 v->now = v->savenow; 326 v->stop = v->savestop; 327 v->savenow = v->savestop = NULL; 328 } 329 330 /* 331 * Skip white space etc. if appropriate (not in literal or []) 332 */ 333 334 if (v->cflags®_EXPANDED) { 335 switch (v->lexcon) { 336 case L_ERE: 337 case L_BRE: 338 case L_EBND: 339 case L_BBND: 340 skip(v); 341 break; 342 } 343 } 344 345 /* 346 * Handle EOS, depending on context. 347 */ 348 349 if (ATEOS()) { 350 switch (v->lexcon) { 351 case L_ERE: 352 case L_BRE: 353 case L_Q: 354 RET(EOS); 355 break; 356 case L_EBND: 357 case L_BBND: 358 FAILW(REG_EBRACE); 359 break; 360 case L_BRACK: 361 case L_CEL: 362 case L_ECL: 363 case L_CCL: 364 FAILW(REG_EBRACK); 365 break; 366 } 367 assert(NOTREACHED); 368 } 369 370 /* 371 * Okay, time to actually get a character. 372 */ 373 374 c = *v->now++; 375 376 /* 377 * Deal with the easy contexts, punt EREs to code below. 378 */ 379 380 switch (v->lexcon) { 381 case L_BRE: /* punt BREs to separate function */ 382 return brenext(v, c); 383 break; 384 case L_ERE: /* see below */ 385 break; 386 case L_Q: /* literal strings are easy */ 387 RETV(PLAIN, c); 388 break; 389 case L_BBND: /* bounds are fairly simple */ 390 case L_EBND: 391 switch (c) { 392 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): 393 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): 394 case CHR('8'): case CHR('9'): 395 RETV(DIGIT, (chr)DIGITVAL(c)); 396 break; 397 case CHR(','): 398 RET(','); 399 break; 400 case CHR('}'): /* ERE bound ends with } */ 401 if (INCON(L_EBND)) { 402 INTOCON(L_ERE); 403 if ((v->cflags®_ADVF) && NEXT1('?')) { 404 v->now++; 405 NOTE(REG_UNONPOSIX); 406 RETV('}', 0); 407 } 408 RETV('}', 1); 409 } else { 410 FAILW(REG_BADBR); 411 } 412 break; 413 case CHR('\\'): /* BRE bound ends with \} */ 414 if (INCON(L_BBND) && NEXT1('}')) { 415 v->now++; 416 INTOCON(L_BRE); 417 RET('}'); 418 } else { 419 FAILW(REG_BADBR); 420 } 421 break; 422 default: 423 FAILW(REG_BADBR); 424 break; 425 } 426 assert(NOTREACHED); 427 break; 428 case L_BRACK: /* brackets are not too hard */ 429 switch (c) { 430 case CHR(']'): 431 if (LASTTYPE('[')) { 432 RETV(PLAIN, c); 433 } else { 434 INTOCON((v->cflags®_EXTENDED) ? L_ERE : L_BRE); 435 RET(']'); 436 } 437 break; 438 case CHR('\\'): 439 NOTE(REG_UBBS); 440 if (!(v->cflags®_ADVF)) { 441 RETV(PLAIN, c); 442 } 443 NOTE(REG_UNONPOSIX); 444 if (ATEOS()) { 445 FAILW(REG_EESCAPE); 446 } 447 (DISCARD)lexescape(v); 448 switch (v->nexttype) { /* not all escapes okay here */ 449 case PLAIN: 450 return 1; 451 break; 452 case CCLASS: 453 switch (v->nextvalue) { 454 case 'd': 455 lexnest(v, brbackd, ENDOF(brbackd)); 456 break; 457 case 's': 458 lexnest(v, brbacks, ENDOF(brbacks)); 459 break; 460 case 'w': 461 lexnest(v, brbackw, ENDOF(brbackw)); 462 break; 463 default: 464 FAILW(REG_EESCAPE); 465 break; 466 } 467 468 /* 469 * lexnest() done, back up and try again. 470 */ 471 472 v->nexttype = v->lasttype; 473 return next(v); 474 break; 475 } 476 477 /* 478 * Not one of the acceptable escapes. 479 */ 480 481 FAILW(REG_EESCAPE); 482 break; 483 case CHR('-'): 484 if (LASTTYPE('[') || NEXT1(']')) { 485 RETV(PLAIN, c); 486 } else { 487 RETV(RANGE, c); 488 } 489 break; 490 case CHR('['): 491 if (ATEOS()) { 492 FAILW(REG_EBRACK); 493 } 494 switch (*v->now++) { 495 case CHR('.'): 496 INTOCON(L_CEL); 497 498 /* 499 * Might or might not be locale-specific. 500 */ 501 502 RET(COLLEL); 503 break; 504 case CHR('='): 505 INTOCON(L_ECL); 506 NOTE(REG_ULOCALE); 507 RET(ECLASS); 508 break; 509 case CHR(':'): 510 INTOCON(L_CCL); 511 NOTE(REG_ULOCALE); 512 RET(CCLASS); 513 break; 514 default: /* oops */ 515 v->now--; 516 RETV(PLAIN, c); 517 break; 518 } 519 assert(NOTREACHED); 520 break; 521 default: 522 RETV(PLAIN, c); 523 break; 524 } 525 assert(NOTREACHED); 526 break; 527 case L_CEL: /* collating elements are easy */ 528 if (c == CHR('.') && NEXT1(']')) { 529 v->now++; 530 INTOCON(L_BRACK); 531 RETV(END, '.'); 532 } else { 533 RETV(PLAIN, c); 534 } 535 break; 536 case L_ECL: /* ditto equivalence classes */ 537 if (c == CHR('=') && NEXT1(']')) { 538 v->now++; 539 INTOCON(L_BRACK); 540 RETV(END, '='); 541 } else { 542 RETV(PLAIN, c); 543 } 544 break; 545 case L_CCL: /* ditto character classes */ 546 if (c == CHR(':') && NEXT1(']')) { 547 v->now++; 548 INTOCON(L_BRACK); 549 RETV(END, ':'); 550 } else { 551 RETV(PLAIN, c); 552 } 553 break; 554 default: 555 assert(NOTREACHED); 556 break; 557 } 558 559 /* 560 * That got rid of everything except EREs and AREs. 561 */ 562 563 assert(INCON(L_ERE)); 564 565 /* 566 * Deal with EREs and AREs, except for backslashes. 567 */ 568 569 switch (c) { 570 case CHR('|'): 571 RET('|'); 572 break; 573 case CHR('*'): 574 if ((v->cflags®_ADVF) && NEXT1('?')) { 575 v->now++; 576 NOTE(REG_UNONPOSIX); 577 RETV('*', 0); 578 } 579 RETV('*', 1); 580 break; 581 case CHR('+'): 582 if ((v->cflags®_ADVF) && NEXT1('?')) { 583 v->now++; 584 NOTE(REG_UNONPOSIX); 585 RETV('+', 0); 586 } 587 RETV('+', 1); 588 break; 589 case CHR('?'): 590 if ((v->cflags®_ADVF) && NEXT1('?')) { 591 v->now++; 592 NOTE(REG_UNONPOSIX); 593 RETV('?', 0); 594 } 595 RETV('?', 1); 596 break; 597 case CHR('{'): /* bounds start or plain character */ 598 if (v->cflags®_EXPANDED) { 599 skip(v); 600 } 601 if (ATEOS() || !iscdigit(*v->now)) { 602 NOTE(REG_UBRACES); 603 NOTE(REG_UUNSPEC); 604 RETV(PLAIN, c); 605 } else { 606 NOTE(REG_UBOUNDS); 607 INTOCON(L_EBND); 608 RET('{'); 609 } 610 assert(NOTREACHED); 611 break; 612 case CHR('('): /* parenthesis, or advanced extension */ 613 if ((v->cflags®_ADVF) && NEXT1('?')) { 614 NOTE(REG_UNONPOSIX); 615 v->now++; 616 switch (*v->now++) { 617 case CHR(':'): /* non-capturing paren */ 618 RETV('(', 0); 619 break; 620 case CHR('#'): /* comment */ 621 while (!ATEOS() && *v->now != CHR(')')) { 622 v->now++; 623 } 624 if (!ATEOS()) { 625 v->now++; 626 } 627 assert(v->nexttype == v->lasttype); 628 return next(v); 629 break; 630 case CHR('='): /* positive lookahead */ 631 NOTE(REG_ULOOKAHEAD); 632 RETV(LACON, 1); 633 break; 634 case CHR('!'): /* negative lookahead */ 635 NOTE(REG_ULOOKAHEAD); 636 RETV(LACON, 0); 637 break; 638 default: 639 FAILW(REG_BADRPT); 640 break; 641 } 642 assert(NOTREACHED); 643 } 644 if (v->cflags®_NOSUB) { 645 RETV('(', 0); /* all parens non-capturing */ 646 } else { 647 RETV('(', 1); 648 } 649 break; 650 case CHR(')'): 651 if (LASTTYPE('(')) { 652 NOTE(REG_UUNSPEC); 653 } 654 RETV(')', c); 655 break; 656 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ 657 if (HAVE(6) && *(v->now+0) == CHR('[') && 658 *(v->now+1) == CHR(':') && 659 (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) && 660 *(v->now+3) == CHR(':') && 661 *(v->now+4) == CHR(']') && 662 *(v->now+5) == CHR(']')) { 663 c = *(v->now+2); 664 v->now += 6; 665 NOTE(REG_UNONPOSIX); 666 RET((c == CHR('<')) ? '<' : '>'); 667 } 668 INTOCON(L_BRACK); 669 if (NEXT1('^')) { 670 v->now++; 671 RETV('[', 0); 672 } 673 RETV('[', 1); 674 break; 675 case CHR('.'): 676 RET('.'); 677 break; 678 case CHR('^'): 679 RET('^'); 680 break; 681 case CHR('$'): 682 RET('$'); 683 break; 684 case CHR('\\'): /* mostly punt backslashes to code below */ 685 if (ATEOS()) { 686 FAILW(REG_EESCAPE); 687 } 688 break; 689 default: /* ordinary character */ 690 RETV(PLAIN, c); 691 break; 692 } 693 694 /* 695 * ERE/ARE backslash handling; backslash already eaten. 696 */ 697 698 assert(!ATEOS()); 699 if (!(v->cflags®_ADVF)) {/* only AREs have non-trivial escapes */ 700 if (iscalnum(*v->now)) { 701 NOTE(REG_UBSALNUM); 702 NOTE(REG_UUNSPEC); 703 } 704 RETV(PLAIN, *v->now++); 705 } 706 (DISCARD)lexescape(v); 707 if (ISERR()) { 708 FAILW(REG_EESCAPE); 709 } 710 if (v->nexttype == CCLASS) {/* fudge at lexical level */ 711 switch (v->nextvalue) { 712 case 'd': lexnest(v, backd, ENDOF(backd)); break; 713 case 'D': lexnest(v, backD, ENDOF(backD)); break; 714 case 's': lexnest(v, backs, ENDOF(backs)); break; 715 case 'S': lexnest(v, backS, ENDOF(backS)); break; 716 case 'w': lexnest(v, backw, ENDOF(backw)); break; 717 case 'W': lexnest(v, backW, ENDOF(backW)); break; 718 default: 719 assert(NOTREACHED); 720 FAILW(REG_ASSERT); 721 break; 722 } 723 /* lexnest done, back up and try again */ 724 v->nexttype = v->lasttype; 725 return next(v); 726 } 727 728 /* 729 * Otherwise, lexescape has already done the work. 730 */ 731 732 return !ISERR(); 733} 734 735/* 736 - lexescape - parse an ARE backslash escape (backslash already eaten) 737 * Note slightly nonstandard use of the CCLASS type code. 738 ^ static int lexescape(struct vars *); 739 */ 740static int /* not actually used, but convenient for RETV */ 741lexescape( 742 struct vars *v) 743{ 744 chr c; 745 static chr alert[] = { 746 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') 747 }; 748 static chr esc[] = { 749 CHR('E'), CHR('S'), CHR('C') 750 }; 751 const chr *save; 752 753 assert(v->cflags®_ADVF); 754 755 assert(!ATEOS()); 756 c = *v->now++; 757 if (!iscalnum(c)) { 758 RETV(PLAIN, c); 759 } 760 761 NOTE(REG_UNONPOSIX); 762 switch (c) { 763 case CHR('a'): 764 RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); 765 break; 766 case CHR('A'): 767 RETV(SBEGIN, 0); 768 break; 769 case CHR('b'): 770 RETV(PLAIN, CHR('\b')); 771 break; 772 case CHR('B'): 773 RETV(PLAIN, CHR('\\')); 774 break; 775 case CHR('c'): 776 NOTE(REG_UUNPORT); 777 if (ATEOS()) { 778 FAILW(REG_EESCAPE); 779 } 780 RETV(PLAIN, (chr)(*v->now++ & 037)); 781 break; 782 case CHR('d'): 783 NOTE(REG_ULOCALE); 784 RETV(CCLASS, 'd'); 785 break; 786 case CHR('D'): 787 NOTE(REG_ULOCALE); 788 RETV(CCLASS, 'D'); 789 break; 790 case CHR('e'): 791 NOTE(REG_UUNPORT); 792 RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); 793 break; 794 case CHR('f'): 795 RETV(PLAIN, CHR('\f')); 796 break; 797 case CHR('m'): 798 RET('<'); 799 break; 800 case CHR('M'): 801 RET('>'); 802 break; 803 case CHR('n'): 804 RETV(PLAIN, CHR('\n')); 805 break; 806 case CHR('r'): 807 RETV(PLAIN, CHR('\r')); 808 break; 809 case CHR('s'): 810 NOTE(REG_ULOCALE); 811 RETV(CCLASS, 's'); 812 break; 813 case CHR('S'): 814 NOTE(REG_ULOCALE); 815 RETV(CCLASS, 'S'); 816 break; 817 case CHR('t'): 818 RETV(PLAIN, CHR('\t')); 819 break; 820 case CHR('u'): 821 c = lexdigits(v, 16, 4, 4); 822 if (ISERR()) { 823 FAILW(REG_EESCAPE); 824 } 825 RETV(PLAIN, c); 826 break; 827 case CHR('U'): 828 c = lexdigits(v, 16, 8, 8); 829 if (ISERR()) { 830 FAILW(REG_EESCAPE); 831 } 832 RETV(PLAIN, c); 833 break; 834 case CHR('v'): 835 RETV(PLAIN, CHR('\v')); 836 break; 837 case CHR('w'): 838 NOTE(REG_ULOCALE); 839 RETV(CCLASS, 'w'); 840 break; 841 case CHR('W'): 842 NOTE(REG_ULOCALE); 843 RETV(CCLASS, 'W'); 844 break; 845 case CHR('x'): 846 NOTE(REG_UUNPORT); 847 c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ 848 if (ISERR()) { 849 FAILW(REG_EESCAPE); 850 } 851 RETV(PLAIN, c); 852 break; 853 case CHR('y'): 854 NOTE(REG_ULOCALE); 855 RETV(WBDRY, 0); 856 break; 857 case CHR('Y'): 858 NOTE(REG_ULOCALE); 859 RETV(NWBDRY, 0); 860 break; 861 case CHR('Z'): 862 RETV(SEND, 0); 863 break; 864 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): 865 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): 866 case CHR('9'): 867 save = v->now; 868 v->now--; /* put first digit back */ 869 c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ 870 if (ISERR()) { 871 FAILW(REG_EESCAPE); 872 } 873 874 /* 875 * Ugly heuristic (first test is "exactly 1 digit?") 876 */ 877 878 if (v->now - save == 0 || ((int) c > 0 && (int)c <= v->nsubexp)) { 879 NOTE(REG_UBACKREF); 880 RETV(BACKREF, (chr)c); 881 } 882 883 /* 884 * Oops, doesn't look like it's a backref after all... 885 */ 886 887 v->now = save; 888 889 /* 890 * And fall through into octal number. 891 */ 892 893 case CHR('0'): 894 NOTE(REG_UUNPORT); 895 v->now--; /* put first digit back */ 896 c = lexdigits(v, 8, 1, 3); 897 if (ISERR()) { 898 FAILW(REG_EESCAPE); 899 } 900 RETV(PLAIN, c); 901 break; 902 default: 903 assert(iscalpha(c)); 904 FAILW(REG_EESCAPE); /* unknown alphabetic escape */ 905 break; 906 } 907 assert(NOTREACHED); 908} 909 910/* 911 - lexdigits - slurp up digits and return chr value 912 ^ static chr lexdigits(struct vars *, int, int, int); 913 */ 914static chr /* chr value; errors signalled via ERR */ 915lexdigits( 916 struct vars *v, 917 int base, 918 int minlen, 919 int maxlen) 920{ 921 uchr n; /* unsigned to avoid overflow misbehavior */ 922 int len; 923 chr c; 924 int d; 925 CONST uchr ub = (uchr) base; 926 927 n = 0; 928 for (len = 0; len < maxlen && !ATEOS(); len++) { 929 c = *v->now++; 930 switch (c) { 931 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): 932 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): 933 case CHR('8'): case CHR('9'): 934 d = DIGITVAL(c); 935 break; 936 case CHR('a'): case CHR('A'): d = 10; break; 937 case CHR('b'): case CHR('B'): d = 11; break; 938 case CHR('c'): case CHR('C'): d = 12; break; 939 case CHR('d'): case CHR('D'): d = 13; break; 940 case CHR('e'): case CHR('E'): d = 14; break; 941 case CHR('f'): case CHR('F'): d = 15; break; 942 default: 943 v->now--; /* oops, not a digit at all */ 944 d = -1; 945 break; 946 } 947 948 if (d >= base) { /* not a plausible digit */ 949 v->now--; 950 d = -1; 951 } 952 if (d < 0) { 953 break; /* NOTE BREAK OUT */ 954 } 955 n = n*ub + (uchr)d; 956 } 957 if (len < minlen) { 958 ERR(REG_EESCAPE); 959 } 960 961 return (chr)n; 962} 963 964/* 965 - brenext - get next BRE token 966 * This is much like EREs except for all the stupid backslashes and the 967 * context-dependency of some things. 968 ^ static int brenext(struct vars *, pchr); 969 */ 970static int /* 1 normal, 0 failure */ 971brenext( 972 struct vars *v, 973 pchr pc) 974{ 975 chr c = (chr)pc; 976 977 switch (c) { 978 case CHR('*'): 979 if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) { 980 RETV(PLAIN, c); 981 } 982 RET('*'); 983 break; 984 case CHR('['): 985 if (HAVE(6) && *(v->now+0) == CHR('[') && 986 *(v->now+1) == CHR(':') && 987 (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) && 988 *(v->now+3) == CHR(':') && 989 *(v->now+4) == CHR(']') && 990 *(v->now+5) == CHR(']')) { 991 c = *(v->now+2); 992 v->now += 6; 993 NOTE(REG_UNONPOSIX); 994 RET((c == CHR('<')) ? '<' : '>'); 995 } 996 INTOCON(L_BRACK); 997 if (NEXT1('^')) { 998 v->now++; 999 RETV('[', 0); 1000 } 1001 RETV('[', 1); 1002 break; 1003 case CHR('.'): 1004 RET('.'); 1005 break; 1006 case CHR('^'): 1007 if (LASTTYPE(EMPTY)) { 1008 RET('^'); 1009 } 1010 if (LASTTYPE('(')) { 1011 NOTE(REG_UUNSPEC); 1012 RET('^'); 1013 } 1014 RETV(PLAIN, c); 1015 break; 1016 case CHR('$'): 1017 if (v->cflags®_EXPANDED) { 1018 skip(v); 1019 } 1020 if (ATEOS()) { 1021 RET('$'); 1022 } 1023 if (NEXT2('\\', ')')) { 1024 NOTE(REG_UUNSPEC); 1025 RET('$'); 1026 } 1027 RETV(PLAIN, c); 1028 break; 1029 case CHR('\\'): 1030 break; /* see below */ 1031 default: 1032 RETV(PLAIN, c); 1033 break; 1034 } 1035 1036 assert(c == CHR('\\')); 1037 1038 if (ATEOS()) { 1039 FAILW(REG_EESCAPE); 1040 } 1041 1042 c = *v->now++; 1043 switch (c) { 1044 case CHR('{'): 1045 INTOCON(L_BBND); 1046 NOTE(REG_UBOUNDS); 1047 RET('{'); 1048 break; 1049 case CHR('('): 1050 RETV('(', 1); 1051 break; 1052 case CHR(')'): 1053 RETV(')', c); 1054 break; 1055 case CHR('<'): 1056 NOTE(REG_UNONPOSIX); 1057 RET('<'); 1058 break; 1059 case CHR('>'): 1060 NOTE(REG_UNONPOSIX); 1061 RET('>'); 1062 break; 1063 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): 1064 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): 1065 case CHR('9'): 1066 NOTE(REG_UBACKREF); 1067 RETV(BACKREF, (chr)DIGITVAL(c)); 1068 break; 1069 default: 1070 if (iscalnum(c)) { 1071 NOTE(REG_UBSALNUM); 1072 NOTE(REG_UUNSPEC); 1073 } 1074 RETV(PLAIN, c); 1075 break; 1076 } 1077 1078 assert(NOTREACHED); 1079} 1080 1081/* 1082 - skip - skip white space and comments in expanded form 1083 ^ static VOID skip(struct vars *); 1084 */ 1085static void 1086skip( 1087 struct vars *v) 1088{ 1089 const chr *start = v->now; 1090 1091 assert(v->cflags®_EXPANDED); 1092 1093 for (;;) { 1094 while (!ATEOS() && iscspace(*v->now)) { 1095 v->now++; 1096 } 1097 if (ATEOS() || *v->now != CHR('#')) { 1098 break; /* NOTE BREAK OUT */ 1099 } 1100 assert(NEXT1('#')); 1101 while (!ATEOS() && *v->now != CHR('\n')) { 1102 v->now++; 1103 } 1104 1105 /* 1106 * Leave the newline to be picked up by the iscspace loop. 1107 */ 1108 } 1109 1110 if (v->now != start) { 1111 NOTE(REG_UNONPOSIX); 1112 } 1113} 1114 1115/* 1116 - newline - return the chr for a newline 1117 * This helps confine use of CHR to this source file. 1118 ^ static chr newline(NOPARMS); 1119 */ 1120static chr 1121newline(void) 1122{ 1123 return CHR('\n'); 1124} 1125 1126/* 1127 - ch - return the chr sequence for regc_locale.c's fake collating element ch 1128 * This helps confine use of CHR to this source file. Beware that the caller 1129 * knows how long the sequence is. 1130 ^ #ifdef REG_DEBUG 1131 ^ static const chr *ch(NOPARMS); 1132 ^ #endif 1133 */ 1134#ifdef REG_DEBUG 1135static const chr * 1136ch(void) 1137{ 1138 static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') }; 1139 1140 return chstr; 1141} 1142#endif 1143 1144/* 1145 - chrnamed - return the chr known by a given (chr string) name 1146 * The code is a bit clumsy, but this routine gets only such specialized 1147 * use that it hardly matters. 1148 ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr); 1149 */ 1150static chr 1151chrnamed( 1152 struct vars *v, 1153 const chr *startp, /* start of name */ 1154 const chr *endp, /* just past end of name */ 1155 pchr lastresort) /* what to return if name lookup fails */ 1156{ 1157 celt c; 1158 int errsave; 1159 int e; 1160 struct cvec *cv; 1161 1162 errsave = v->err; 1163 v->err = 0; 1164 c = element(v, startp, endp); 1165 e = v->err; 1166 v->err = errsave; 1167 1168 if (e != 0) { 1169 return (chr)lastresort; 1170 } 1171 1172 cv = range(v, c, c, 0); 1173 if (cv->nchrs == 0) { 1174 return (chr)lastresort; 1175 } 1176 return cv->chrs[0]; 1177} 1178 1179/* 1180 * Local Variables: 1181 * mode: c 1182 * c-basic-offset: 4 1183 * fill-column: 78 1184 * End: 1185 */ 1186