1/* 2 * lexical analyzer 3 * This file is #included by regcomp.c. 4 * 5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 6 * 7 * Development of this software was funded, in part, by Cray Research Inc., 8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 9 * Corporation, none of whom are responsible for the results. The author 10 * thanks all of them. 11 * 12 * Redistribution and use in source and binary forms -- with or without 13 * modification -- are permitted for any purpose, provided that 14 * redistributions in source form retain this entire copyright notice and 15 * indicate the origin and nature of any modifications. 16 * 17 * I'd appreciate being given credit for this package in the documentation 18 * of software which uses it, but that is not a requirement. 19 * 20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 */ 32 33/* scanning macros (know about v) */ 34#define ATEOS() (v->now >= v->stop) 35#define HAVE(n) (v->stop - v->now >= (n)) 36#define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) 37#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) 38#define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \ 39 *(v->now+1) == CHR(b) && \ 40 *(v->now+2) == CHR(c)) 41#define SET(c) (v->nexttype = (c)) 42#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) 43#define RET(c) return (SET(c), 1) 44#define RETV(c, n) return (SETV(c, n), 1) 45#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ 46#define LASTTYPE(t) (v->lasttype == (t)) 47 48/* lexical contexts */ 49#define L_ERE 1 /* mainline ERE/ARE */ 50#define L_BRE 2 /* mainline BRE */ 51#define L_Q 3 /* REG_QUOTE */ 52#define L_EBND 4 /* ERE/ARE bound */ 53#define L_BBND 5 /* BRE bound */ 54#define L_BRACK 6 /* brackets */ 55#define L_CEL 7 /* collating element */ 56#define L_ECL 8 /* equivalence class */ 57#define L_CCL 9 /* character class */ 58#define INTOCON(c) (v->lexcon = (c)) 59#define INCON(con) (v->lexcon == (con)) 60 61/* construct pointer past end of chr array */ 62#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) 63 64/* 65 - lexstart - set up lexical stuff, scan leading options 66 ^ static VOID lexstart(struct vars *); 67 */ 68static VOID 69lexstart(v) 70struct vars *v; 71{ 72 prefixes(v); /* may turn on new type bits etc. */ 73 NOERR(); 74 75 if (v->cflags®_QUOTE) { 76 assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))); 77 INTOCON(L_Q); 78 } else if (v->cflags®_EXTENDED) { 79 assert(!(v->cflags®_QUOTE)); 80 INTOCON(L_ERE); 81 } else { 82 assert(!(v->cflags&(REG_QUOTE|REG_ADVF))); 83 INTOCON(L_BRE); 84 } 85 86 v->nexttype = EMPTY; /* remember we were at the start */ 87 next(v); /* set up the first token */ 88} 89 90/* 91 - prefixes - implement various special prefixes 92 ^ static VOID prefixes(struct vars *); 93 */ 94static VOID 95prefixes(v) 96struct vars *v; 97{ 98 /* literal string doesn't get any of this stuff */ 99 if (v->cflags®_QUOTE) 100 return; 101 102 /* initial "***" gets special things */ 103 if (HAVE(4) && NEXT3('*', '*', '*')) 104 switch (*(v->now + 3)) { 105 case CHR('?'): /* "***?" error, msg shows version */ 106 ERR(REG_BADPAT); 107 return; /* proceed no further */ 108 break; 109 case CHR('='): /* "***=" shifts to literal string */ 110 NOTE(REG_UNONPOSIX); 111 v->cflags |= REG_QUOTE; 112 v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE); 113 v->now += 4; 114 return; /* and there can be no more prefixes */ 115 break; 116 case CHR(':'): /* "***:" shifts to AREs */ 117 NOTE(REG_UNONPOSIX); 118 v->cflags |= REG_ADVANCED; 119 v->now += 4; 120 break; 121 default: /* otherwise *** is just an error */ 122 ERR(REG_BADRPT); 123 return; 124 break; 125 } 126 127 /* BREs and EREs don't get embedded options */ 128 if ((v->cflags®_ADVANCED) != REG_ADVANCED) 129 return; 130 131 /* embedded options (AREs only) */ 132 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) { 133 NOTE(REG_UNONPOSIX); 134 v->now += 2; 135 for (; !ATEOS() && iscalpha(*v->now); v->now++) 136 switch (*v->now) { 137 case CHR('b'): /* BREs (but why???) */ 138 v->cflags &= ~(REG_ADVANCED|REG_QUOTE); 139 break; 140 case CHR('c'): /* case sensitive */ 141 v->cflags &= ~REG_ICASE; 142 break; 143 case CHR('e'): /* plain EREs */ 144 v->cflags |= REG_EXTENDED; 145 v->cflags &= ~(REG_ADVF|REG_QUOTE); 146 break; 147 case CHR('i'): /* case insensitive */ 148 v->cflags |= REG_ICASE; 149 break; 150 case CHR('m'): /* Perloid synonym for n */ 151 case CHR('n'): /* \n affects ^ $ . [^ */ 152 v->cflags |= REG_NEWLINE; 153 break; 154 case CHR('p'): /* ~Perl, \n affects . [^ */ 155 v->cflags |= REG_NLSTOP; 156 v->cflags &= ~REG_NLANCH; 157 break; 158 case CHR('q'): /* literal string */ 159 v->cflags |= REG_QUOTE; 160 v->cflags &= ~REG_ADVANCED; 161 break; 162 case CHR('s'): /* single line, \n ordinary */ 163 v->cflags &= ~REG_NEWLINE; 164 break; 165 case CHR('t'): /* tight syntax */ 166 v->cflags &= ~REG_EXPANDED; 167 break; 168 case CHR('w'): /* weird, \n affects ^ $ only */ 169 v->cflags &= ~REG_NLSTOP; 170 v->cflags |= REG_NLANCH; 171 break; 172 case CHR('x'): /* expanded syntax */ 173 v->cflags |= REG_EXPANDED; 174 break; 175 default: 176 ERR(REG_BADOPT); 177 return; 178 } 179 if (!NEXT1(')')) { 180 ERR(REG_BADOPT); 181 return; 182 } 183 v->now++; 184 if (v->cflags®_QUOTE) 185 v->cflags &= ~(REG_EXPANDED|REG_NEWLINE); 186 } 187} 188 189/* 190 - lexnest - "call a subroutine", interpolating string at the lexical level 191 * Note, this is not a very general facility. There are a number of 192 * implicit assumptions about what sorts of strings can be subroutines. 193 ^ static VOID lexnest(struct vars *, chr *, chr *); 194 */ 195static VOID 196lexnest(v, beginp, endp) 197struct vars *v; 198chr *beginp; /* start of interpolation */ 199chr *endp; /* one past end of interpolation */ 200{ 201 assert(v->savenow == NULL); /* only one level of nesting */ 202 v->savenow = v->now; 203 v->savestop = v->stop; 204 v->now = beginp; 205 v->stop = endp; 206} 207 208/* 209 * string constants to interpolate as expansions of things like \d 210 */ 211static chr backd[] = { /* \d */ 212 CHR('['), CHR('['), CHR(':'), 213 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 214 CHR(':'), CHR(']'), CHR(']') 215}; 216static chr backD[] = { /* \D */ 217 CHR('['), CHR('^'), CHR('['), CHR(':'), 218 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 219 CHR(':'), CHR(']'), CHR(']') 220}; 221static chr brbackd[] = { /* \d within brackets */ 222 CHR('['), CHR(':'), 223 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 224 CHR(':'), CHR(']') 225}; 226static chr backs[] = { /* \s */ 227 CHR('['), CHR('['), CHR(':'), 228 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 229 CHR(':'), CHR(']'), CHR(']') 230}; 231static chr backS[] = { /* \S */ 232 CHR('['), CHR('^'), CHR('['), CHR(':'), 233 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 234 CHR(':'), CHR(']'), CHR(']') 235}; 236static chr brbacks[] = { /* \s within brackets */ 237 CHR('['), CHR(':'), 238 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 239 CHR(':'), CHR(']') 240}; 241static chr backw[] = { /* \w */ 242 CHR('['), CHR('['), CHR(':'), 243 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 244 CHR(':'), CHR(']'), CHR('_'), CHR(']') 245}; 246static chr backW[] = { /* \W */ 247 CHR('['), CHR('^'), CHR('['), CHR(':'), 248 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 249 CHR(':'), CHR(']'), CHR('_'), CHR(']') 250}; 251static chr brbackw[] = { /* \w within brackets */ 252 CHR('['), CHR(':'), 253 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 254 CHR(':'), CHR(']'), CHR('_') 255}; 256 257/* 258 - lexword - interpolate a bracket expression for word characters 259 * Possibly ought to inquire whether there is a "word" character class. 260 ^ static VOID lexword(struct vars *); 261 */ 262static VOID 263lexword(v) 264struct vars *v; 265{ 266 lexnest(v, backw, ENDOF(backw)); 267} 268 269/* 270 - next - get next token 271 ^ static int next(struct vars *); 272 */ 273static int /* 1 normal, 0 failure */ 274next(v) 275struct vars *v; 276{ 277 chr c; 278 279 /* errors yield an infinite sequence of failures */ 280 if (ISERR()) 281 return 0; /* the error has set nexttype to EOS */ 282 283 /* remember flavor of last token */ 284 v->lasttype = v->nexttype; 285 286 /* REG_BOSONLY */ 287 if (v->nexttype == EMPTY && (v->cflags®_BOSONLY)) { 288 /* at start of a REG_BOSONLY RE */ 289 RETV(SBEGIN, 0); /* same as \A */ 290 } 291 292 /* if we're nested and we've hit end, return to outer level */ 293 if (v->savenow != NULL && ATEOS()) { 294 v->now = v->savenow; 295 v->stop = v->savestop; 296 v->savenow = v->savestop = NULL; 297 } 298 299 /* skip white space etc. if appropriate (not in literal or []) */ 300 if (v->cflags®_EXPANDED) 301 switch (v->lexcon) { 302 case L_ERE: 303 case L_BRE: 304 case L_EBND: 305 case L_BBND: 306 skip(v); 307 break; 308 } 309 310 /* handle EOS, depending on context */ 311 if (ATEOS()) { 312 switch (v->lexcon) { 313 case L_ERE: 314 case L_BRE: 315 case L_Q: 316 RET(EOS); 317 break; 318 case L_EBND: 319 case L_BBND: 320 FAILW(REG_EBRACE); 321 break; 322 case L_BRACK: 323 case L_CEL: 324 case L_ECL: 325 case L_CCL: 326 FAILW(REG_EBRACK); 327 break; 328 } 329 assert(NOTREACHED); 330 } 331 332 /* okay, time to actually get a character */ 333 c = *v->now++; 334 335 /* deal with the easy contexts, punt EREs to code below */ 336 switch (v->lexcon) { 337 case L_BRE: /* punt BREs to separate function */ 338 return brenext(v, c); 339 break; 340 case L_ERE: /* see below */ 341 break; 342 case L_Q: /* literal strings are easy */ 343 RETV(PLAIN, c); 344 break; 345 case L_BBND: /* bounds are fairly simple */ 346 case L_EBND: 347 switch (c) { 348 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): 349 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): 350 case CHR('8'): case CHR('9'): 351 RETV(DIGIT, (chr)DIGITVAL(c)); 352 break; 353 case CHR(','): 354 RET(','); 355 break; 356 case CHR('}'): /* ERE bound ends with } */ 357 if (INCON(L_EBND)) { 358 INTOCON(L_ERE); 359 if ((v->cflags®_ADVF) && NEXT1('?')) { 360 v->now++; 361 NOTE(REG_UNONPOSIX); 362 RETV('}', 0); 363 } 364 RETV('}', 1); 365 } else 366 FAILW(REG_BADBR); 367 break; 368 case CHR('\\'): /* BRE bound ends with \} */ 369 if (INCON(L_BBND) && NEXT1('}')) { 370 v->now++; 371 INTOCON(L_BRE); 372 RET('}'); 373 } else 374 FAILW(REG_BADBR); 375 break; 376 default: 377 FAILW(REG_BADBR); 378 break; 379 } 380 assert(NOTREACHED); 381 break; 382 case L_BRACK: /* brackets are not too hard */ 383 switch (c) { 384 case CHR(']'): 385 if (LASTTYPE('[')) 386 RETV(PLAIN, c); 387 else { 388 INTOCON((v->cflags®_EXTENDED) ? 389 L_ERE : L_BRE); 390 RET(']'); 391 } 392 break; 393 case CHR('\\'): 394 NOTE(REG_UBBS); 395 if (!(v->cflags®_ADVF)) 396 RETV(PLAIN, c); 397 NOTE(REG_UNONPOSIX); 398 if (ATEOS()) 399 FAILW(REG_EESCAPE); 400 (DISCARD)lexescape(v); 401 switch (v->nexttype) { /* not all escapes okay here */ 402 case PLAIN: 403 return 1; 404 break; 405 case CCLASS: 406 switch (v->nextvalue) { 407 case 'd': 408 lexnest(v, brbackd, ENDOF(brbackd)); 409 break; 410 case 's': 411 lexnest(v, brbacks, ENDOF(brbacks)); 412 break; 413 case 'w': 414 lexnest(v, brbackw, ENDOF(brbackw)); 415 break; 416 default: 417 FAILW(REG_EESCAPE); 418 break; 419 } 420 /* lexnest done, back up and try again */ 421 v->nexttype = v->lasttype; 422 return next(v); 423 break; 424 } 425 /* not one of the acceptable escapes */ 426 FAILW(REG_EESCAPE); 427 break; 428 case CHR('-'): 429 if (LASTTYPE('[') || NEXT1(']')) 430 RETV(PLAIN, c); 431 else 432 RETV(RANGE, c); 433 break; 434 case CHR('['): 435 if (ATEOS()) 436 FAILW(REG_EBRACK); 437 switch (*v->now++) { 438 case CHR('.'): 439 INTOCON(L_CEL); 440 /* might or might not be locale-specific */ 441 RET(COLLEL); 442 break; 443 case CHR('='): 444 INTOCON(L_ECL); 445 NOTE(REG_ULOCALE); 446 RET(ECLASS); 447 break; 448 case CHR(':'): 449 INTOCON(L_CCL); 450 NOTE(REG_ULOCALE); 451 RET(CCLASS); 452 break; 453 default: /* oops */ 454 v->now--; 455 RETV(PLAIN, c); 456 break; 457 } 458 assert(NOTREACHED); 459 break; 460 default: 461 RETV(PLAIN, c); 462 break; 463 } 464 assert(NOTREACHED); 465 break; 466 case L_CEL: /* collating elements are easy */ 467 if (c == CHR('.') && NEXT1(']')) { 468 v->now++; 469 INTOCON(L_BRACK); 470 RETV(END, '.'); 471 } else 472 RETV(PLAIN, c); 473 break; 474 case L_ECL: /* ditto equivalence classes */ 475 if (c == CHR('=') && NEXT1(']')) { 476 v->now++; 477 INTOCON(L_BRACK); 478 RETV(END, '='); 479 } else 480 RETV(PLAIN, c); 481 break; 482 case L_CCL: /* ditto character classes */ 483 if (c == CHR(':') && NEXT1(']')) { 484 v->now++; 485 INTOCON(L_BRACK); 486 RETV(END, ':'); 487 } else 488 RETV(PLAIN, c); 489 break; 490 default: 491 assert(NOTREACHED); 492 break; 493 } 494 495 /* that got rid of everything except EREs and AREs */ 496 assert(INCON(L_ERE)); 497 498 /* deal with EREs and AREs, except for backslashes */ 499 switch (c) { 500 case CHR('|'): 501 RET('|'); 502 break; 503 case CHR('*'): 504 if ((v->cflags®_ADVF) && NEXT1('?')) { 505 v->now++; 506 NOTE(REG_UNONPOSIX); 507 RETV('*', 0); 508 } 509 RETV('*', 1); 510 break; 511 case CHR('+'): 512 if ((v->cflags®_ADVF) && NEXT1('?')) { 513 v->now++; 514 NOTE(REG_UNONPOSIX); 515 RETV('+', 0); 516 } 517 RETV('+', 1); 518 break; 519 case CHR('?'): 520 if ((v->cflags®_ADVF) && NEXT1('?')) { 521 v->now++; 522 NOTE(REG_UNONPOSIX); 523 RETV('?', 0); 524 } 525 RETV('?', 1); 526 break; 527 case CHR('{'): /* bounds start or plain character */ 528 if (v->cflags®_EXPANDED) 529 skip(v); 530 if (ATEOS() || !iscdigit(*v->now)) { 531 NOTE(REG_UBRACES); 532 NOTE(REG_UUNSPEC); 533 RETV(PLAIN, c); 534 } else { 535 NOTE(REG_UBOUNDS); 536 INTOCON(L_EBND); 537 RET('{'); 538 } 539 assert(NOTREACHED); 540 break; 541 case CHR('('): /* parenthesis, or advanced extension */ 542 if ((v->cflags®_ADVF) && NEXT1('?')) { 543 NOTE(REG_UNONPOSIX); 544 v->now++; 545 switch (*v->now++) { 546 case CHR(':'): /* non-capturing paren */ 547 RETV('(', 0); 548 break; 549 case CHR('#'): /* comment */ 550 while (!ATEOS() && *v->now != CHR(')')) 551 v->now++; 552 if (!ATEOS()) 553 v->now++; 554 assert(v->nexttype == v->lasttype); 555 return next(v); 556 break; 557 case CHR('='): /* positive lookahead */ 558 NOTE(REG_ULOOKAHEAD); 559 RETV(LACON, 1); 560 break; 561 case CHR('!'): /* negative lookahead */ 562 NOTE(REG_ULOOKAHEAD); 563 RETV(LACON, 0); 564 break; 565 default: 566 FAILW(REG_BADRPT); 567 break; 568 } 569 assert(NOTREACHED); 570 } 571 if (v->cflags®_NOSUB) 572 RETV('(', 0); /* all parens non-capturing */ 573 else 574 RETV('(', 1); 575 break; 576 case CHR(')'): 577 if (LASTTYPE('(')) { 578 NOTE(REG_UUNSPEC); 579 } 580 RETV(')', c); 581 break; 582 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ 583 if (HAVE(6) && *(v->now+0) == CHR('[') && 584 *(v->now+1) == CHR(':') && 585 (*(v->now+2) == CHR('<') || 586 *(v->now+2) == CHR('>')) && 587 *(v->now+3) == CHR(':') && 588 *(v->now+4) == CHR(']') && 589 *(v->now+5) == CHR(']')) { 590 c = *(v->now+2); 591 v->now += 6; 592 NOTE(REG_UNONPOSIX); 593 RET((c == CHR('<')) ? '<' : '>'); 594 } 595 INTOCON(L_BRACK); 596 if (NEXT1('^')) { 597 v->now++; 598 RETV('[', 0); 599 } 600 RETV('[', 1); 601 break; 602 case CHR('.'): 603 RET('.'); 604 break; 605 case CHR('^'): 606 RET('^'); 607 break; 608 case CHR('$'): 609 RET('$'); 610 break; 611 case CHR('\\'): /* mostly punt backslashes to code below */ 612 if (ATEOS()) 613 FAILW(REG_EESCAPE); 614 break; 615 default: /* ordinary character */ 616 RETV(PLAIN, c); 617 break; 618 } 619 620 /* ERE/ARE backslash handling; backslash already eaten */ 621 assert(!ATEOS()); 622 if (!(v->cflags®_ADVF)) { /* only AREs have non-trivial escapes */ 623 if (iscalnum(*v->now)) { 624 NOTE(REG_UBSALNUM); 625 NOTE(REG_UUNSPEC); 626 } 627 RETV(PLAIN, *v->now++); 628 } 629 (DISCARD)lexescape(v); 630 if (ISERR()) 631 FAILW(REG_EESCAPE); 632 if (v->nexttype == CCLASS) { /* fudge at lexical level */ 633 switch (v->nextvalue) { 634 case 'd': lexnest(v, backd, ENDOF(backd)); break; 635 case 'D': lexnest(v, backD, ENDOF(backD)); break; 636 case 's': lexnest(v, backs, ENDOF(backs)); break; 637 case 'S': lexnest(v, backS, ENDOF(backS)); break; 638 case 'w': lexnest(v, backw, ENDOF(backw)); break; 639 case 'W': lexnest(v, backW, ENDOF(backW)); break; 640 default: 641 assert(NOTREACHED); 642 FAILW(REG_ASSERT); 643 break; 644 } 645 /* lexnest done, back up and try again */ 646 v->nexttype = v->lasttype; 647 return next(v); 648 } 649 /* otherwise, lexescape has already done the work */ 650 return !ISERR(); 651} 652 653/* 654 - lexescape - parse an ARE backslash escape (backslash already eaten) 655 * Note slightly nonstandard use of the CCLASS type code. 656 ^ static int lexescape(struct vars *); 657 */ 658static int /* not actually used, but convenient for RETV */ 659lexescape(v) 660struct vars *v; 661{ 662 chr c; 663 static chr alert[] = { 664 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') 665 }; 666 static chr esc[] = { 667 CHR('E'), CHR('S'), CHR('C') 668 }; 669 chr *save; 670 671 assert(v->cflags®_ADVF); 672 673 assert(!ATEOS()); 674 c = *v->now++; 675 if (!iscalnum(c)) 676 RETV(PLAIN, c); 677 678 NOTE(REG_UNONPOSIX); 679 switch (c) { 680 case CHR('a'): 681 RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); 682 break; 683 case CHR('A'): 684 RETV(SBEGIN, 0); 685 break; 686 case CHR('b'): 687 RETV(PLAIN, CHR('\b')); 688 break; 689 case CHR('B'): 690 RETV(PLAIN, CHR('\\')); 691 break; 692 case CHR('c'): 693 NOTE(REG_UUNPORT); 694 if (ATEOS()) 695 FAILW(REG_EESCAPE); 696 RETV(PLAIN, (chr)(*v->now++ & 037)); 697 break; 698 case CHR('d'): 699 NOTE(REG_ULOCALE); 700 RETV(CCLASS, 'd'); 701 break; 702 case CHR('D'): 703 NOTE(REG_ULOCALE); 704 RETV(CCLASS, 'D'); 705 break; 706 case CHR('e'): 707 NOTE(REG_UUNPORT); 708 RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); 709 break; 710 case CHR('f'): 711 RETV(PLAIN, CHR('\f')); 712 break; 713 case CHR('m'): 714 RET('<'); 715 break; 716 case CHR('M'): 717 RET('>'); 718 break; 719 case CHR('n'): 720 RETV(PLAIN, CHR('\n')); 721 break; 722 case CHR('r'): 723 RETV(PLAIN, CHR('\r')); 724 break; 725 case CHR('s'): 726 NOTE(REG_ULOCALE); 727 RETV(CCLASS, 's'); 728 break; 729 case CHR('S'): 730 NOTE(REG_ULOCALE); 731 RETV(CCLASS, 'S'); 732 break; 733 case CHR('t'): 734 RETV(PLAIN, CHR('\t')); 735 break; 736 case CHR('u'): 737 c = lexdigits(v, 16, 4, 4); 738 if (ISERR()) 739 FAILW(REG_EESCAPE); 740 RETV(PLAIN, c); 741 break; 742 case CHR('U'): 743 c = lexdigits(v, 16, 8, 8); 744 if (ISERR()) 745 FAILW(REG_EESCAPE); 746 RETV(PLAIN, c); 747 break; 748 case CHR('v'): 749 RETV(PLAIN, CHR('\v')); 750 break; 751 case CHR('w'): 752 NOTE(REG_ULOCALE); 753 RETV(CCLASS, 'w'); 754 break; 755 case CHR('W'): 756 NOTE(REG_ULOCALE); 757 RETV(CCLASS, 'W'); 758 break; 759 case CHR('x'): 760 NOTE(REG_UUNPORT); 761 c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ 762 if (ISERR()) 763 FAILW(REG_EESCAPE); 764 RETV(PLAIN, c); 765 break; 766 case CHR('y'): 767 NOTE(REG_ULOCALE); 768 RETV(WBDRY, 0); 769 break; 770 case CHR('Y'): 771 NOTE(REG_ULOCALE); 772 RETV(NWBDRY, 0); 773 break; 774 case CHR('Z'): 775 RETV(SEND, 0); 776 break; 777 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): 778 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): 779 case CHR('9'): 780 save = v->now; 781 v->now--; /* put first digit back */ 782 c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ 783 if (ISERR()) 784 FAILW(REG_EESCAPE); 785 /* ugly heuristic (first test is "exactly 1 digit?") */ 786 if (v->now-save == 0 || ((int)c > 0 && (int)c <= v->nsubexp)) { 787 NOTE(REG_UBACKREF); 788 RETV(BACKREF, (chr)c); 789 } 790 /* oops, doesn't look like it's a backref after all... */ 791 v->now = save; 792 /* and fall through into octal number */ 793 case CHR('0'): 794 NOTE(REG_UUNPORT); 795 v->now--; /* put first digit back */ 796 c = lexdigits(v, 8, 1, 3); 797 if (ISERR()) 798 FAILW(REG_EESCAPE); 799 RETV(PLAIN, c); 800 break; 801 default: 802 assert(iscalpha(c)); 803 FAILW(REG_EESCAPE); /* unknown alphabetic escape */ 804 break; 805 } 806 assert(NOTREACHED); 807} 808 809/* 810 - lexdigits - slurp up digits and return chr value 811 ^ static chr lexdigits(struct vars *, int, int, int); 812 */ 813static chr /* chr value; errors signalled via ERR */ 814lexdigits(v, base, minlen, maxlen) 815struct vars *v; 816int base; 817int minlen; 818int maxlen; 819{ 820 uchr n; /* unsigned to avoid overflow misbehavior */ 821 int len; 822 chr c; 823 int d; 824 CONST uchr ub = (uchr) base; 825 826 n = 0; 827 for (len = 0; len < maxlen && !ATEOS(); len++) { 828 c = *v->now++; 829 switch (c) { 830 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): 831 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): 832 case CHR('8'): case CHR('9'): 833 d = DIGITVAL(c); 834 break; 835 case CHR('a'): case CHR('A'): d = 10; break; 836 case CHR('b'): case CHR('B'): d = 11; break; 837 case CHR('c'): case CHR('C'): d = 12; break; 838 case CHR('d'): case CHR('D'): d = 13; break; 839 case CHR('e'): case CHR('E'): d = 14; break; 840 case CHR('f'): case CHR('F'): d = 15; break; 841 default: 842 v->now--; /* oops, not a digit at all */ 843 d = -1; 844 break; 845 } 846 847 if (d >= base) { /* not a plausible digit */ 848 v->now--; 849 d = -1; 850 } 851 if (d < 0) 852 break; /* NOTE BREAK OUT */ 853 n = n*ub + (uchr)d; 854 } 855 if (len < minlen) 856 ERR(REG_EESCAPE); 857 858 return (chr)n; 859} 860 861/* 862 - brenext - get next BRE token 863 * This is much like EREs except for all the stupid backslashes and the 864 * context-dependency of some things. 865 ^ static int brenext(struct vars *, pchr); 866 */ 867static int /* 1 normal, 0 failure */ 868brenext(v, pc) 869struct vars *v; 870pchr pc; 871{ 872 chr c = (chr)pc; 873 874 switch (c) { 875 case CHR('*'): 876 if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) 877 RETV(PLAIN, c); 878 RET('*'); 879 break; 880 case CHR('['): 881 if (HAVE(6) && *(v->now+0) == CHR('[') && 882 *(v->now+1) == CHR(':') && 883 (*(v->now+2) == CHR('<') || 884 *(v->now+2) == CHR('>')) && 885 *(v->now+3) == CHR(':') && 886 *(v->now+4) == CHR(']') && 887 *(v->now+5) == CHR(']')) { 888 c = *(v->now+2); 889 v->now += 6; 890 NOTE(REG_UNONPOSIX); 891 RET((c == CHR('<')) ? '<' : '>'); 892 } 893 INTOCON(L_BRACK); 894 if (NEXT1('^')) { 895 v->now++; 896 RETV('[', 0); 897 } 898 RETV('[', 1); 899 break; 900 case CHR('.'): 901 RET('.'); 902 break; 903 case CHR('^'): 904 if (LASTTYPE(EMPTY)) 905 RET('^'); 906 if (LASTTYPE('(')) { 907 NOTE(REG_UUNSPEC); 908 RET('^'); 909 } 910 RETV(PLAIN, c); 911 break; 912 case CHR('$'): 913 if (v->cflags®_EXPANDED) 914 skip(v); 915 if (ATEOS()) 916 RET('$'); 917 if (NEXT2('\\', ')')) { 918 NOTE(REG_UUNSPEC); 919 RET('$'); 920 } 921 RETV(PLAIN, c); 922 break; 923 case CHR('\\'): 924 break; /* see below */ 925 default: 926 RETV(PLAIN, c); 927 break; 928 } 929 930 assert(c == CHR('\\')); 931 932 if (ATEOS()) 933 FAILW(REG_EESCAPE); 934 935 c = *v->now++; 936 switch (c) { 937 case CHR('{'): 938 INTOCON(L_BBND); 939 NOTE(REG_UBOUNDS); 940 RET('{'); 941 break; 942 case CHR('('): 943 RETV('(', 1); 944 break; 945 case CHR(')'): 946 RETV(')', c); 947 break; 948 case CHR('<'): 949 NOTE(REG_UNONPOSIX); 950 RET('<'); 951 break; 952 case CHR('>'): 953 NOTE(REG_UNONPOSIX); 954 RET('>'); 955 break; 956 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): 957 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): 958 case CHR('9'): 959 NOTE(REG_UBACKREF); 960 RETV(BACKREF, (chr)DIGITVAL(c)); 961 break; 962 default: 963 if (iscalnum(c)) { 964 NOTE(REG_UBSALNUM); 965 NOTE(REG_UUNSPEC); 966 } 967 RETV(PLAIN, c); 968 break; 969 } 970 971 assert(NOTREACHED); 972} 973 974/* 975 - skip - skip white space and comments in expanded form 976 ^ static VOID skip(struct vars *); 977 */ 978static VOID 979skip(v) 980struct vars *v; 981{ 982 chr *start = v->now; 983 984 assert(v->cflags®_EXPANDED); 985 986 for (;;) { 987 while (!ATEOS() && iscspace(*v->now)) 988 v->now++; 989 if (ATEOS() || *v->now != CHR('#')) 990 break; /* NOTE BREAK OUT */ 991 assert(NEXT1('#')); 992 while (!ATEOS() && *v->now != CHR('\n')) 993 v->now++; 994 /* leave the newline to be picked up by the iscspace loop */ 995 } 996 997 if (v->now != start) 998 NOTE(REG_UNONPOSIX); 999} 1000 1001/* 1002 - newline - return the chr for a newline 1003 * This helps confine use of CHR to this source file. 1004 ^ static chr newline(NOPARMS); 1005 */ 1006static chr 1007newline() 1008{ 1009 return CHR('\n'); 1010} 1011 1012/* 1013 - ch - return the chr sequence for regc_locale.c's fake collating element ch 1014 * This helps confine use of CHR to this source file. Beware that the caller 1015 * knows how long the sequence is. 1016 ^ #ifdef REG_DEBUG 1017 ^ static chr *ch(NOPARMS); 1018 ^ #endif 1019 */ 1020#ifdef REG_DEBUG 1021static chr * 1022ch() 1023{ 1024 static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') }; 1025 1026 return chstr; 1027} 1028#endif 1029 1030/* 1031 - chrnamed - return the chr known by a given (chr string) name 1032 * The code is a bit clumsy, but this routine gets only such specialized 1033 * use that it hardly matters. 1034 ^ static chr chrnamed(struct vars *, chr *, chr *, pchr); 1035 */ 1036static chr 1037chrnamed(v, startp, endp, lastresort) 1038struct vars *v; 1039chr *startp; /* start of name */ 1040chr *endp; /* just past end of name */ 1041pchr lastresort; /* what to return if name lookup fails */ 1042{ 1043 celt c; 1044 int errsave; 1045 int e; 1046 struct cvec *cv; 1047 1048 errsave = v->err; 1049 v->err = 0; 1050 c = element(v, startp, endp); 1051 e = v->err; 1052 v->err = errsave; 1053 1054 if (e != 0) 1055 return (chr)lastresort; 1056 1057 cv = range(v, c, c, 0); 1058 if (cv->nchrs == 0) 1059 return (chr)lastresort; 1060 return cv->chrs[0]; 1061} 1062