1/*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 */ 39 40#if defined(LIBC_SCCS) && !defined(lint) 41static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 42#endif /* LIBC_SCCS and not lint */ 43 44#include <sys/types.h> 45#include <stdio.h> 46#include <string.h> 47#include <ctype.h> 48#include <limits.h>
|
49#include <locale.h>
|
49#include <stdlib.h> 50#include <regex.h> 51
|
52#include "collate.h" 53 |
54#include "utils.h" 55#include "regex2.h" 56 57#include "cclass.h" 58#include "cname.h" 59 60/* 61 * parse structure, passed up and down to avoid global variables and 62 * other clumsinesses 63 */ 64struct parse { 65 char *next; /* next character in RE */ 66 char *end; /* end of string (-> NUL normally) */ 67 int error; /* has an error been seen? */ 68 sop *strip; /* malloced strip */ 69 sopno ssize; /* malloced strip size (allocated) */ 70 sopno slen; /* malloced strip length (used) */ 71 int ncsalloc; /* number of csets allocated */ 72 struct re_guts *g; 73# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 74 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 75 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 76}; 77 78/* ========= begin header generated by ./mkh ========= */ 79#ifdef __cplusplus 80extern "C" { 81#endif 82 83/* === regcomp.c === */ 84static void p_ere __P((struct parse *p, int stop)); 85static void p_ere_exp __P((struct parse *p)); 86static void p_str __P((struct parse *p)); 87static void p_bre __P((struct parse *p, int end1, int end2)); 88static int p_simp_re __P((struct parse *p, int starordinary)); 89static int p_count __P((struct parse *p)); 90static void p_bracket __P((struct parse *p)); 91static void p_b_term __P((struct parse *p, cset *cs)); 92static void p_b_cclass __P((struct parse *p, cset *cs)); 93static void p_b_eclass __P((struct parse *p, cset *cs)); 94static char p_b_symbol __P((struct parse *p)); 95static char p_b_coll_elem __P((struct parse *p, int endc)); 96static char othercase __P((int ch)); 97static void bothcases __P((struct parse *p, int ch)); 98static void ordinary __P((struct parse *p, int ch)); 99static void nonnewline __P((struct parse *p)); 100static void repeat __P((struct parse *p, sopno start, int from, int to)); 101static int seterr __P((struct parse *p, int e)); 102static cset *allocset __P((struct parse *p)); 103static void freeset __P((struct parse *p, cset *cs)); 104static int freezeset __P((struct parse *p, cset *cs)); 105static int firstch __P((struct parse *p, cset *cs)); 106static int nch __P((struct parse *p, cset *cs)); 107static void mcadd __P((struct parse *p, cset *cs, char *cp)); 108#if used 109static void mcsub __P((cset *cs, char *cp)); 110static int mcin __P((cset *cs, char *cp)); 111static char *mcfind __P((cset *cs, char *cp)); 112#endif 113static void mcinvert __P((struct parse *p, cset *cs)); 114static void mccase __P((struct parse *p, cset *cs)); 115static int isinsets __P((struct re_guts *g, int c)); 116static int samesets __P((struct re_guts *g, int c1, int c2)); 117static void categorize __P((struct parse *p, struct re_guts *g)); 118static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 119static void doemit __P((struct parse *p, sop op, size_t opnd)); 120static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 121static void dofwd __P((struct parse *p, sopno pos, sop value)); 122static void enlarge __P((struct parse *p, sopno size)); 123static void stripsnug __P((struct parse *p, struct re_guts *g)); 124static void findmust __P((struct parse *p, struct re_guts *g)); 125static sopno pluscount __P((struct parse *p, struct re_guts *g)); 126 127#ifdef __cplusplus 128} 129#endif 130/* ========= end header generated by ./mkh ========= */ 131 132static char nuls[10]; /* place to point scanner in event of error */ 133 134/* 135 * macros for use with parse structure 136 * BEWARE: these know that the parse structure is named `p' !!! 137 */ 138#define PEEK() (*p->next) 139#define PEEK2() (*(p->next+1)) 140#define MORE() (p->next < p->end) 141#define MORE2() (p->next+1 < p->end) 142#define SEE(c) (MORE() && PEEK() == (c)) 143#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 144#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 145#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 146#define NEXT() (p->next++) 147#define NEXT2() (p->next += 2) 148#define NEXTn(n) (p->next += (n)) 149#define GETNEXT() (*p->next++) 150#define SETERROR(e) seterr(p, (e)) 151#define REQUIRE(co, e) ((co) || SETERROR(e)) 152#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 153#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 154#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 155#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 156#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 157#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 158#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 159#define HERE() (p->slen) 160#define THERE() (p->slen - 1) 161#define THERETHERE() (p->slen - 2) 162#define DROP(n) (p->slen -= (n)) 163 164#ifndef NDEBUG 165static int never = 0; /* for use in asserts; shuts lint up */ 166#else 167#define never 0 /* some <assert.h>s have bugs too */ 168#endif 169 170/* 171 - regcomp - interface for parser and compilation 172 = extern int regcomp(regex_t *, const char *, int); 173 = #define REG_BASIC 0000 174 = #define REG_EXTENDED 0001 175 = #define REG_ICASE 0002 176 = #define REG_NOSUB 0004 177 = #define REG_NEWLINE 0010 178 = #define REG_NOSPEC 0020 179 = #define REG_PEND 0040 180 = #define REG_DUMP 0200 181 */ 182int /* 0 success, otherwise REG_something */ 183regcomp(preg, pattern, cflags) 184regex_t *preg; 185const char *pattern; 186int cflags; 187{ 188 struct parse pa; 189 register struct re_guts *g; 190 register struct parse *p = &pa; 191 register int i; 192 register size_t len; 193#ifdef REDEBUG 194# define GOODFLAGS(f) (f) 195#else 196# define GOODFLAGS(f) ((f)&~REG_DUMP) 197#endif 198 199 cflags = GOODFLAGS(cflags); 200 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 201 return(REG_INVARG); 202 203 if (cflags®_PEND) { 204 if (preg->re_endp < pattern) 205 return(REG_INVARG); 206 len = preg->re_endp - pattern; 207 } else 208 len = strlen((char *)pattern); 209 210 /* do the mallocs early so failure handling is easy */ 211 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 212 (NC-1)*sizeof(cat_t)); 213 if (g == NULL) 214 return(REG_ESPACE); 215 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 216 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 217 p->slen = 0; 218 if (p->strip == NULL) { 219 free((char *)g); 220 return(REG_ESPACE); 221 } 222 223 /* set things up */ 224 p->g = g; 225 p->next = (char *)pattern; /* convenience; we do not modify it */ 226 p->end = p->next + len; 227 p->error = 0; 228 p->ncsalloc = 0; 229 for (i = 0; i < NPAREN; i++) { 230 p->pbegin[i] = 0; 231 p->pend[i] = 0; 232 } 233 g->csetsize = NC; 234 g->sets = NULL; 235 g->setbits = NULL; 236 g->ncsets = 0; 237 g->cflags = cflags; 238 g->iflags = 0; 239 g->nbol = 0; 240 g->neol = 0; 241 g->must = NULL; 242 g->mlen = 0; 243 g->nsub = 0; 244 g->ncategories = 1; /* category 0 is "everything else" */ 245 g->categories = &g->catspace[-(CHAR_MIN)]; 246 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 247 g->backrefs = 0; 248 249 /* do it */ 250 EMIT(OEND, 0); 251 g->firststate = THERE(); 252 if (cflags®_EXTENDED) 253 p_ere(p, OUT); 254 else if (cflags®_NOSPEC) 255 p_str(p); 256 else 257 p_bre(p, OUT, OUT); 258 EMIT(OEND, 0); 259 g->laststate = THERE(); 260 261 /* tidy up loose ends and fill things in */ 262 categorize(p, g); 263 stripsnug(p, g); 264 findmust(p, g); 265 g->nplus = pluscount(p, g); 266 g->magic = MAGIC2; 267 preg->re_nsub = g->nsub; 268 preg->re_g = g; 269 preg->re_magic = MAGIC1; 270#ifndef REDEBUG 271 /* not debugging, so can't rely on the assert() in regexec() */ 272 if (g->iflags&BAD) 273 SETERROR(REG_ASSERT); 274#endif 275 276 /* win or lose, we're done */ 277 if (p->error != 0) /* lose */ 278 regfree(preg); 279 return(p->error); 280} 281 282/* 283 - p_ere - ERE parser top level, concatenation and alternation 284 == static void p_ere(register struct parse *p, int stop); 285 */ 286static void 287p_ere(p, stop) 288register struct parse *p; 289int stop; /* character this ERE should end at */ 290{ 291 register char c; 292 register sopno prevback; 293 register sopno prevfwd; 294 register sopno conc; 295 register int first = 1; /* is this the first alternative? */ 296 297 for (;;) { 298 /* do a bunch of concatenated expressions */ 299 conc = HERE(); 300 while (MORE() && (c = PEEK()) != '|' && c != stop) 301 p_ere_exp(p); 302 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 303 304 if (!EAT('|')) 305 break; /* NOTE BREAK OUT */ 306 307 if (first) { 308 INSERT(OCH_, conc); /* offset is wrong */ 309 prevfwd = conc; 310 prevback = conc; 311 first = 0; 312 } 313 ASTERN(OOR1, prevback); 314 prevback = THERE(); 315 AHEAD(prevfwd); /* fix previous offset */ 316 prevfwd = HERE(); 317 EMIT(OOR2, 0); /* offset is very wrong */ 318 } 319 320 if (!first) { /* tail-end fixups */ 321 AHEAD(prevfwd); 322 ASTERN(O_CH, prevback); 323 } 324 325 assert(!MORE() || SEE(stop)); 326} 327 328/* 329 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 330 == static void p_ere_exp(register struct parse *p); 331 */ 332static void 333p_ere_exp(p) 334register struct parse *p; 335{ 336 register char c; 337 register sopno pos; 338 register int count; 339 register int count2; 340 register sopno subno; 341 int wascaret = 0; 342 343 assert(MORE()); /* caller should have ensured this */ 344 c = GETNEXT(); 345 346 pos = HERE(); 347 switch (c) { 348 case '(': 349 (void)REQUIRE(MORE(), REG_EPAREN); 350 p->g->nsub++; 351 subno = p->g->nsub; 352 if (subno < NPAREN) 353 p->pbegin[subno] = HERE(); 354 EMIT(OLPAREN, subno); 355 if (!SEE(')')) 356 p_ere(p, ')'); 357 if (subno < NPAREN) { 358 p->pend[subno] = HERE(); 359 assert(p->pend[subno] != 0); 360 } 361 EMIT(ORPAREN, subno); 362 (void)MUSTEAT(')', REG_EPAREN); 363 break; 364#ifndef POSIX_MISTAKE 365 case ')': /* happens only if no current unmatched ( */ 366 /* 367 * You may ask, why the ifndef? Because I didn't notice 368 * this until slightly too late for 1003.2, and none of the 369 * other 1003.2 regular-expression reviewers noticed it at 370 * all. So an unmatched ) is legal POSIX, at least until 371 * we can get it fixed. 372 */ 373 SETERROR(REG_EPAREN); 374 break; 375#endif 376 case '^': 377 EMIT(OBOL, 0); 378 p->g->iflags |= USEBOL; 379 p->g->nbol++; 380 wascaret = 1; 381 break; 382 case '$': 383 EMIT(OEOL, 0); 384 p->g->iflags |= USEEOL; 385 p->g->neol++; 386 break; 387 case '|': 388 SETERROR(REG_EMPTY); 389 break; 390 case '*': 391 case '+': 392 case '?': 393 SETERROR(REG_BADRPT); 394 break; 395 case '.': 396 if (p->g->cflags®_NEWLINE) 397 nonnewline(p); 398 else 399 EMIT(OANY, 0); 400 break; 401 case '[': 402 p_bracket(p); 403 break; 404 case '\\': 405 (void)REQUIRE(MORE(), REG_EESCAPE); 406 c = GETNEXT(); 407 ordinary(p, c); 408 break; 409 case '{': /* okay as ordinary except if digit follows */ 410 (void)REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 411 /* FALLTHROUGH */ 412 default: 413 ordinary(p, c); 414 break; 415 } 416 417 if (!MORE()) 418 return; 419 c = PEEK(); 420 /* we call { a repetition if followed by a digit */ 421 if (!( c == '*' || c == '+' || c == '?' || 422 (c == '{' && MORE2() && isdigit(PEEK2())) )) 423 return; /* no repetition, we're done */ 424 NEXT(); 425 426 (void)REQUIRE(!wascaret, REG_BADRPT); 427 switch (c) { 428 case '*': /* implemented as +? */ 429 /* this case does not require the (y|) trick, noKLUDGE */ 430 INSERT(OPLUS_, pos); 431 ASTERN(O_PLUS, pos); 432 INSERT(OQUEST_, pos); 433 ASTERN(O_QUEST, pos); 434 break; 435 case '+': 436 INSERT(OPLUS_, pos); 437 ASTERN(O_PLUS, pos); 438 break; 439 case '?': 440 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 441 INSERT(OCH_, pos); /* offset slightly wrong */ 442 ASTERN(OOR1, pos); /* this one's right */ 443 AHEAD(pos); /* fix the OCH_ */ 444 EMIT(OOR2, 0); /* offset very wrong... */ 445 AHEAD(THERE()); /* ...so fix it */ 446 ASTERN(O_CH, THERETHERE()); 447 break; 448 case '{': 449 count = p_count(p); 450 if (EAT(',')) { 451 if (isdigit(PEEK())) { 452 count2 = p_count(p); 453 (void)REQUIRE(count <= count2, REG_BADBR); 454 } else /* single number with comma */ 455 count2 = INFINITY; 456 } else /* just a single number */ 457 count2 = count; 458 repeat(p, pos, count, count2); 459 if (!EAT('}')) { /* error heuristics */ 460 while (MORE() && PEEK() != '}') 461 NEXT(); 462 (void)REQUIRE(MORE(), REG_EBRACE); 463 SETERROR(REG_BADBR); 464 } 465 break; 466 } 467 468 if (!MORE()) 469 return; 470 c = PEEK(); 471 if (!( c == '*' || c == '+' || c == '?' || 472 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 473 return; 474 SETERROR(REG_BADRPT); 475} 476 477/* 478 - p_str - string (no metacharacters) "parser" 479 == static void p_str(register struct parse *p); 480 */ 481static void 482p_str(p) 483register struct parse *p; 484{ 485 (void)REQUIRE(MORE(), REG_EMPTY); 486 while (MORE()) 487 ordinary(p, GETNEXT()); 488} 489 490/* 491 - p_bre - BRE parser top level, anchoring and concatenation 492 == static void p_bre(register struct parse *p, register int end1, \ 493 == register int end2); 494 * Giving end1 as OUT essentially eliminates the end1/end2 check. 495 * 496 * This implementation is a bit of a kludge, in that a trailing $ is first 497 * taken as an ordinary character and then revised to be an anchor. The 498 * only undesirable side effect is that '$' gets included as a character 499 * category in such cases. This is fairly harmless; not worth fixing. 500 * The amount of lookahead needed to avoid this kludge is excessive. 501 */ 502static void 503p_bre(p, end1, end2) 504register struct parse *p; 505register int end1; /* first terminating character */ 506register int end2; /* second terminating character */ 507{ 508 register sopno start = HERE(); 509 register int first = 1; /* first subexpression? */ 510 register int wasdollar = 0; 511 512 if (EAT('^')) { 513 EMIT(OBOL, 0); 514 p->g->iflags |= USEBOL; 515 p->g->nbol++; 516 } 517 while (MORE() && !SEETWO(end1, end2)) { 518 wasdollar = p_simp_re(p, first); 519 first = 0; 520 } 521 if (wasdollar) { /* oops, that was a trailing anchor */ 522 DROP(1); 523 EMIT(OEOL, 0); 524 p->g->iflags |= USEEOL; 525 p->g->neol++; 526 } 527 528 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 529} 530 531/* 532 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 533 == static int p_simp_re(register struct parse *p, int starordinary); 534 */ 535static int /* was the simple RE an unbackslashed $? */ 536p_simp_re(p, starordinary) 537register struct parse *p; 538int starordinary; /* is a leading * an ordinary character? */ 539{ 540 register int c; 541 register int count; 542 register int count2; 543 register sopno pos; 544 register int i; 545 register sopno subno; 546# define BACKSL (1<<CHAR_BIT) 547 548 pos = HERE(); /* repetion op, if any, covers from here */ 549 550 assert(MORE()); /* caller should have ensured this */ 551 c = GETNEXT(); 552 if (c == '\\') { 553 (void)REQUIRE(MORE(), REG_EESCAPE); 554 c = BACKSL | (unsigned char)GETNEXT(); 555 } 556 switch (c) { 557 case '.': 558 if (p->g->cflags®_NEWLINE) 559 nonnewline(p); 560 else 561 EMIT(OANY, 0); 562 break; 563 case '[': 564 p_bracket(p); 565 break; 566 case BACKSL|'{': 567 SETERROR(REG_BADRPT); 568 break; 569 case BACKSL|'(': 570 p->g->nsub++; 571 subno = p->g->nsub; 572 if (subno < NPAREN) 573 p->pbegin[subno] = HERE(); 574 EMIT(OLPAREN, subno); 575 /* the MORE here is an error heuristic */ 576 if (MORE() && !SEETWO('\\', ')')) 577 p_bre(p, '\\', ')'); 578 if (subno < NPAREN) { 579 p->pend[subno] = HERE(); 580 assert(p->pend[subno] != 0); 581 } 582 EMIT(ORPAREN, subno); 583 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 584 break; 585 case BACKSL|')': /* should not get here -- must be user */ 586 case BACKSL|'}': 587 SETERROR(REG_EPAREN); 588 break; 589 case BACKSL|'1': 590 case BACKSL|'2': 591 case BACKSL|'3': 592 case BACKSL|'4': 593 case BACKSL|'5': 594 case BACKSL|'6': 595 case BACKSL|'7': 596 case BACKSL|'8': 597 case BACKSL|'9': 598 i = (c&~BACKSL) - '0'; 599 assert(i < NPAREN); 600 if (p->pend[i] != 0) { 601 assert(i <= p->g->nsub); 602 EMIT(OBACK_, i); 603 assert(p->pbegin[i] != 0); 604 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 605 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 606 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 607 EMIT(O_BACK, i); 608 } else 609 SETERROR(REG_ESUBREG); 610 p->g->backrefs = 1; 611 break; 612 case '*': 613 (void)REQUIRE(starordinary, REG_BADRPT); 614 /* FALLTHROUGH */ 615 default: 616 ordinary(p, c &~ BACKSL); 617 break; 618 } 619 620 if (EAT('*')) { /* implemented as +? */ 621 /* this case does not require the (y|) trick, noKLUDGE */ 622 INSERT(OPLUS_, pos); 623 ASTERN(O_PLUS, pos); 624 INSERT(OQUEST_, pos); 625 ASTERN(O_QUEST, pos); 626 } else if (EATTWO('\\', '{')) { 627 count = p_count(p); 628 if (EAT(',')) { 629 if (MORE() && isdigit(PEEK())) { 630 count2 = p_count(p); 631 (void)REQUIRE(count <= count2, REG_BADBR); 632 } else /* single number with comma */ 633 count2 = INFINITY; 634 } else /* just a single number */ 635 count2 = count; 636 repeat(p, pos, count, count2); 637 if (!EATTWO('\\', '}')) { /* error heuristics */ 638 while (MORE() && !SEETWO('\\', '}')) 639 NEXT(); 640 (void)REQUIRE(MORE(), REG_EBRACE); 641 SETERROR(REG_BADBR); 642 } 643 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 644 return(1); 645 646 return(0); 647} 648 649/* 650 - p_count - parse a repetition count 651 == static int p_count(register struct parse *p); 652 */ 653static int /* the value */ 654p_count(p) 655register struct parse *p; 656{ 657 register int count = 0; 658 register int ndigits = 0; 659 660 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 661 count = count*10 + (GETNEXT() - '0'); 662 ndigits++; 663 } 664 665 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 666 return(count); 667} 668 669/* 670 - p_bracket - parse a bracketed character list 671 == static void p_bracket(register struct parse *p); 672 * 673 * Note a significant property of this code: if the allocset() did SETERROR, 674 * no set operations are done. 675 */ 676static void 677p_bracket(p) 678register struct parse *p; 679{ 680 register cset *cs = allocset(p); 681 register int invert = 0; 682 683 /* Dept of Truly Sickening Special-Case Kludges */ 684 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 685 EMIT(OBOW, 0); 686 NEXTn(6); 687 return; 688 } 689 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 690 EMIT(OEOW, 0); 691 NEXTn(6); 692 return; 693 } 694 695 if (EAT('^')) 696 invert++; /* make note to invert set at end */ 697 if (EAT(']')) 698 CHadd(cs, ']'); 699 else if (EAT('-')) 700 CHadd(cs, '-'); 701 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 702 p_b_term(p, cs); 703 if (EAT('-')) 704 CHadd(cs, '-'); 705 (void)MUSTEAT(']', REG_EBRACK); 706 707 if (p->error != 0) /* don't mess things up further */ 708 return; 709 710 if (p->g->cflags®_ICASE) { 711 register int i; 712 register int ci; 713 714 for (i = p->g->csetsize - 1; i >= 0; i--) 715 if (CHIN(cs, i) && isalpha(i)) { 716 ci = othercase(i); 717 if (ci != i) 718 CHadd(cs, ci); 719 } 720 if (cs->multis != NULL) 721 mccase(p, cs); 722 } 723 if (invert) { 724 register int i; 725 726 for (i = p->g->csetsize - 1; i >= 0; i--) 727 if (CHIN(cs, i)) 728 CHsub(cs, i); 729 else 730 CHadd(cs, i); 731 if (p->g->cflags®_NEWLINE) 732 CHsub(cs, '\n'); 733 if (cs->multis != NULL) 734 mcinvert(p, cs); 735 } 736 737 assert(cs->multis == NULL); /* xxx */ 738 739 if (nch(p, cs) == 1) { /* optimize singleton sets */ 740 ordinary(p, firstch(p, cs)); 741 freeset(p, cs); 742 } else 743 EMIT(OANYOF, freezeset(p, cs)); 744} 745 746/* 747 - p_b_term - parse one term of a bracketed character list 748 == static void p_b_term(register struct parse *p, register cset *cs); 749 */ 750static void 751p_b_term(p, cs) 752register struct parse *p; 753register cset *cs; 754{ 755 register char c; 756 register char start, finish; 757 register int i; 758 759 /* classify what we've got */ 760 switch ((MORE()) ? PEEK() : '\0') { 761 case '[': 762 c = (MORE2()) ? PEEK2() : '\0'; 763 break; 764 case '-': 765 SETERROR(REG_ERANGE); 766 return; /* NOTE RETURN */ 767 break; 768 default: 769 c = '\0'; 770 break; 771 } 772 773 switch (c) { 774 case ':': /* character class */ 775 NEXT2(); 776 (void)REQUIRE(MORE(), REG_EBRACK); 777 c = PEEK(); 778 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 779 p_b_cclass(p, cs); 780 (void)REQUIRE(MORE(), REG_EBRACK); 781 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 782 break; 783 case '=': /* equivalence class */ 784 NEXT2(); 785 (void)REQUIRE(MORE(), REG_EBRACK); 786 c = PEEK(); 787 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 788 p_b_eclass(p, cs); 789 (void)REQUIRE(MORE(), REG_EBRACK); 790 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 791 break; 792 default: /* symbol, ordinary character, or range */ 793/* xxx revision needed for multichar stuff */ 794 start = p_b_symbol(p); 795 if (SEE('-') && MORE2() && PEEK2() != ']') { 796 /* range */ 797 NEXT(); 798 if (EAT('-')) 799 finish = '-'; 800 else 801 finish = p_b_symbol(p); 802 } else 803 finish = start; 804 if (start == finish) 805 CHadd(cs, start); 806 else {
|
806 (void)REQUIRE(collate_range_cmp(start, finish) <= 0, REG_ERANGE);
|
807 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); |
808 for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
|
808 if ( collate_range_cmp(start, i) <= 0
809 && collate_range_cmp(i, finish) <= 0
|
809 if ( __collate_range_cmp(start, i) <= 0 810 && __collate_range_cmp(i, finish) <= 0 |
811 ) 812 CHadd(cs, i); 813 } 814 } 815 break; 816 } 817} 818 819/* 820 - p_b_cclass - parse a character-class name and deal with it 821 == static void p_b_cclass(register struct parse *p, register cset *cs); 822 */ 823static void 824p_b_cclass(p, cs) 825register struct parse *p; 826register cset *cs; 827{ 828 register int c; 829 register char *sp = p->next; 830 register struct cclass *cp; 831 register size_t len; 832 833 while (MORE() && isalpha((uch)PEEK())) 834 NEXT(); 835 len = p->next - sp; 836 for (cp = cclasses; cp->name != NULL; cp++) 837 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 838 break; 839 if (cp->name == NULL) { 840 /* oops, didn't find it */ 841 SETERROR(REG_ECTYPE); 842 return; 843 } 844 845 switch (cp->fidx) { 846 case CALNUM: 847 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 848 if (isalnum((uch)c)) 849 CHadd(cs, c); 850 break; 851 case CALPHA: 852 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 853 if (isalpha((uch)c)) 854 CHadd(cs, c); 855 break; 856 case CBLANK: 857 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 858 if (isblank((uch)c)) 859 CHadd(cs, c); 860 break; 861 case CCNTRL: 862 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 863 if (iscntrl((uch)c)) 864 CHadd(cs, c); 865 break; 866 case CDIGIT: 867 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 868 if (isdigit((uch)c)) 869 CHadd(cs, c); 870 break; 871 case CGRAPH: 872 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 873 if (isgraph((uch)c)) 874 CHadd(cs, c); 875 break; 876 case CLOWER: 877 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 878 if (islower((uch)c)) 879 CHadd(cs, c); 880 break; 881 case CPRINT: 882 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 883 if (isprint((uch)c)) 884 CHadd(cs, c); 885 break; 886 case CPUNCT: 887 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 888 if (ispunct((uch)c)) 889 CHadd(cs, c); 890 break; 891 case CSPACE: 892 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 893 if (isspace((uch)c)) 894 CHadd(cs, c); 895 break; 896 case CUPPER: 897 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 898 if (isupper((uch)c)) 899 CHadd(cs, c); 900 break; 901 case CXDIGIT: 902 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 903 if (isxdigit((uch)c)) 904 CHadd(cs, c); 905 break; 906 } 907#if 0 908 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 909 MCadd(p, cs, u); 910#endif 911} 912 913/* 914 - p_b_eclass - parse an equivalence-class name and deal with it 915 == static void p_b_eclass(register struct parse *p, register cset *cs); 916 * 917 * This implementation is incomplete. xxx 918 */ 919static void 920p_b_eclass(p, cs) 921register struct parse *p; 922register cset *cs; 923{ 924 register char c; 925 926 c = p_b_coll_elem(p, '='); 927 CHadd(cs, c); 928} 929 930/* 931 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 932 == static char p_b_symbol(register struct parse *p); 933 */ 934static char /* value of symbol */ 935p_b_symbol(p) 936register struct parse *p; 937{ 938 register char value; 939 940 (void)REQUIRE(MORE(), REG_EBRACK); 941 if (!EATTWO('[', '.')) 942 return(GETNEXT()); 943 944 /* collating symbol */ 945 value = p_b_coll_elem(p, '.'); 946 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 947 return(value); 948} 949 950/* 951 - p_b_coll_elem - parse a collating-element name and look it up 952 == static char p_b_coll_elem(register struct parse *p, int endc); 953 */ 954static char /* value of collating element */ 955p_b_coll_elem(p, endc) 956register struct parse *p; 957int endc; /* name ended by endc,']' */ 958{ 959 register char *sp = p->next; 960 register struct cname *cp; 961 register int len; 962 963 while (MORE() && !SEETWO(endc, ']')) 964 NEXT(); 965 if (!MORE()) { 966 SETERROR(REG_EBRACK); 967 return(0); 968 } 969 len = p->next - sp; 970 for (cp = cnames; cp->name != NULL; cp++) 971 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 972 return(cp->code); /* known name */ 973 if (len == 1) 974 return(*sp); /* single character */ 975 SETERROR(REG_ECOLLATE); /* neither */ 976 return(0); 977} 978 979/* 980 - othercase - return the case counterpart of an alphabetic 981 == static char othercase(int ch); 982 */ 983static char /* if no counterpart, return ch */ 984othercase(ch) 985int ch; 986{ 987 ch = (unsigned char)ch; 988 assert(isalpha(ch)); 989 if (isupper(ch)) 990 return(tolower(ch)); 991 else if (islower(ch)) 992 return(toupper(ch)); 993 else /* peculiar, but could happen */ 994 return(ch); 995} 996 997/* 998 - bothcases - emit a dualcase version of a two-case character 999 == static void bothcases(register struct parse *p, int ch); 1000 * 1001 * Boy, is this implementation ever a kludge... 1002 */ 1003static void 1004bothcases(p, ch) 1005register struct parse *p; 1006int ch; 1007{ 1008 register char *oldnext = p->next; 1009 register char *oldend = p->end; 1010 char bracket[3]; 1011 1012 ch = (unsigned char)ch; 1013 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1014 p->next = bracket; 1015 p->end = bracket+2; 1016 bracket[0] = ch; 1017 bracket[1] = ']'; 1018 bracket[2] = '\0'; 1019 p_bracket(p); 1020 assert(p->next == bracket+2); 1021 p->next = oldnext; 1022 p->end = oldend; 1023} 1024 1025/* 1026 - ordinary - emit an ordinary character 1027 == static void ordinary(register struct parse *p, register int ch); 1028 */ 1029static void 1030ordinary(p, ch) 1031register struct parse *p; 1032register int ch; 1033{ 1034 register cat_t *cap = p->g->categories; 1035 1036 if ((p->g->cflags®_ICASE) && isalpha((unsigned char)ch) && othercase(ch) != ch) 1037 bothcases(p, ch); 1038 else { 1039 EMIT(OCHAR, (unsigned char)ch); 1040 if (cap[ch] == 0) 1041 cap[ch] = p->g->ncategories++; 1042 } 1043} 1044 1045/* 1046 - nonnewline - emit REG_NEWLINE version of OANY 1047 == static void nonnewline(register struct parse *p); 1048 * 1049 * Boy, is this implementation ever a kludge... 1050 */ 1051static void 1052nonnewline(p) 1053register struct parse *p; 1054{ 1055 register char *oldnext = p->next; 1056 register char *oldend = p->end; 1057 char bracket[4]; 1058 1059 p->next = bracket; 1060 p->end = bracket+3; 1061 bracket[0] = '^'; 1062 bracket[1] = '\n'; 1063 bracket[2] = ']'; 1064 bracket[3] = '\0'; 1065 p_bracket(p); 1066 assert(p->next == bracket+3); 1067 p->next = oldnext; 1068 p->end = oldend; 1069} 1070 1071/* 1072 - repeat - generate code for a bounded repetition, recursively if needed 1073 == static void repeat(register struct parse *p, sopno start, int from, int to); 1074 */ 1075static void 1076repeat(p, start, from, to) 1077register struct parse *p; 1078sopno start; /* operand from here to end of strip */ 1079int from; /* repeated from this number */ 1080int to; /* to this number of times (maybe INFINITY) */ 1081{ 1082 register sopno finish = HERE(); 1083# define N 2 1084# define INF 3 1085# define REP(f, t) ((f)*8 + (t)) 1086# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1087 register sopno copy; 1088 1089 if (p->error != 0) /* head off possible runaway recursion */ 1090 return; 1091 1092 assert(from <= to); 1093 1094 switch (REP(MAP(from), MAP(to))) { 1095 case REP(0, 0): /* must be user doing this */ 1096 DROP(finish-start); /* drop the operand */ 1097 break; 1098 case REP(0, 1): /* as x{1,1}? */ 1099 case REP(0, N): /* as x{1,n}? */ 1100 case REP(0, INF): /* as x{1,}? */ 1101 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1102 INSERT(OCH_, start); /* offset is wrong... */ 1103 repeat(p, start+1, 1, to); 1104 ASTERN(OOR1, start); 1105 AHEAD(start); /* ... fix it */ 1106 EMIT(OOR2, 0); 1107 AHEAD(THERE()); 1108 ASTERN(O_CH, THERETHERE()); 1109 break; 1110 case REP(1, 1): /* trivial case */ 1111 /* done */ 1112 break; 1113 case REP(1, N): /* as x?x{1,n-1} */ 1114 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1115 INSERT(OCH_, start); 1116 ASTERN(OOR1, start); 1117 AHEAD(start); 1118 EMIT(OOR2, 0); /* offset very wrong... */ 1119 AHEAD(THERE()); /* ...so fix it */ 1120 ASTERN(O_CH, THERETHERE()); 1121 copy = dupl(p, start+1, finish+1); 1122 assert(copy == finish+4); 1123 repeat(p, copy, 1, to-1); 1124 break; 1125 case REP(1, INF): /* as x+ */ 1126 INSERT(OPLUS_, start); 1127 ASTERN(O_PLUS, start); 1128 break; 1129 case REP(N, N): /* as xx{m-1,n-1} */ 1130 copy = dupl(p, start, finish); 1131 repeat(p, copy, from-1, to-1); 1132 break; 1133 case REP(N, INF): /* as xx{n-1,INF} */ 1134 copy = dupl(p, start, finish); 1135 repeat(p, copy, from-1, to); 1136 break; 1137 default: /* "can't happen" */ 1138 SETERROR(REG_ASSERT); /* just in case */ 1139 break; 1140 } 1141} 1142 1143/* 1144 - seterr - set an error condition 1145 == static int seterr(register struct parse *p, int e); 1146 */ 1147static int /* useless but makes type checking happy */ 1148seterr(p, e) 1149register struct parse *p; 1150int e; 1151{ 1152 if (p->error == 0) /* keep earliest error condition */ 1153 p->error = e; 1154 p->next = nuls; /* try to bring things to a halt */ 1155 p->end = nuls; 1156 return(0); /* make the return value well-defined */ 1157} 1158 1159/* 1160 - allocset - allocate a set of characters for [] 1161 == static cset *allocset(register struct parse *p); 1162 */ 1163static cset * 1164allocset(p) 1165register struct parse *p; 1166{ 1167 register int no = p->g->ncsets++; 1168 register size_t nc; 1169 register size_t nbytes; 1170 register cset *cs; 1171 register size_t css = (size_t)p->g->csetsize; 1172 register int i; 1173 1174 if (no >= p->ncsalloc) { /* need another column of space */ 1175 p->ncsalloc += CHAR_BIT; 1176 nc = p->ncsalloc; 1177 assert(nc % CHAR_BIT == 0); 1178 nbytes = nc / CHAR_BIT * css; 1179 if (p->g->sets == NULL) 1180 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1181 else 1182 p->g->sets = (cset *)realloc((char *)p->g->sets, 1183 nc * sizeof(cset)); 1184 if (p->g->setbits == NULL) 1185 p->g->setbits = (uch *)malloc(nbytes); 1186 else { 1187 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1188 nbytes); 1189 /* xxx this isn't right if setbits is now NULL */ 1190 for (i = 0; i < no; i++) 1191 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1192 } 1193 if (p->g->sets != NULL && p->g->setbits != NULL) 1194 (void) memset((char *)p->g->setbits + (nbytes - css), 1195 0, css); 1196 else { 1197 no = 0; 1198 SETERROR(REG_ESPACE); 1199 /* caller's responsibility not to do set ops */ 1200 } 1201 } 1202 1203 assert(p->g->sets != NULL); /* xxx */ 1204 cs = &p->g->sets[no]; 1205 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1206 cs->mask = 1 << ((no) % CHAR_BIT); 1207 cs->hash = 0; 1208 cs->smultis = 0; 1209 cs->multis = NULL; 1210 1211 return(cs); 1212} 1213 1214/* 1215 - freeset - free a now-unused set 1216 == static void freeset(register struct parse *p, register cset *cs); 1217 */ 1218static void 1219freeset(p, cs) 1220register struct parse *p; 1221register cset *cs; 1222{ 1223 register int i; 1224 register cset *top = &p->g->sets[p->g->ncsets]; 1225 register size_t css = (size_t)p->g->csetsize; 1226 1227 for (i = 0; i < css; i++) 1228 CHsub(cs, i); 1229 if (cs == top-1) /* recover only the easy case */ 1230 p->g->ncsets--; 1231} 1232 1233/* 1234 - freezeset - final processing on a set of characters 1235 == static int freezeset(register struct parse *p, register cset *cs); 1236 * 1237 * The main task here is merging identical sets. This is usually a waste 1238 * of time (although the hash code minimizes the overhead), but can win 1239 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1240 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1241 * the same value! 1242 */ 1243static int /* set number */ 1244freezeset(p, cs) 1245register struct parse *p; 1246register cset *cs; 1247{ 1248 register short h = cs->hash; 1249 register int i; 1250 register cset *top = &p->g->sets[p->g->ncsets]; 1251 register cset *cs2; 1252 register size_t css = (size_t)p->g->csetsize; 1253 1254 /* look for an earlier one which is the same */ 1255 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1256 if (cs2->hash == h && cs2 != cs) { 1257 /* maybe */ 1258 for (i = 0; i < css; i++) 1259 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1260 break; /* no */ 1261 if (i == css) 1262 break; /* yes */ 1263 } 1264 1265 if (cs2 < top) { /* found one */ 1266 freeset(p, cs); 1267 cs = cs2; 1268 } 1269 1270 return((int)(cs - p->g->sets)); 1271} 1272 1273/* 1274 - firstch - return first character in a set (which must have at least one) 1275 == static int firstch(register struct parse *p, register cset *cs); 1276 */ 1277static int /* character; there is no "none" value */ 1278firstch(p, cs) 1279register struct parse *p; 1280register cset *cs; 1281{ 1282 register int i; 1283 register size_t css = (size_t)p->g->csetsize; 1284 1285 for (i = 0; i < css; i++) 1286 if (CHIN(cs, i)) 1287 return((unsigned char)i); 1288 assert(never); 1289 return(0); /* arbitrary */ 1290} 1291 1292/* 1293 - nch - number of characters in a set 1294 == static int nch(register struct parse *p, register cset *cs); 1295 */ 1296static int 1297nch(p, cs) 1298register struct parse *p; 1299register cset *cs; 1300{ 1301 register int i; 1302 register size_t css = (size_t)p->g->csetsize; 1303 register int n = 0; 1304 1305 for (i = 0; i < css; i++) 1306 if (CHIN(cs, i)) 1307 n++; 1308 return(n); 1309} 1310 1311/* 1312 - mcadd - add a collating element to a cset 1313 == static void mcadd(register struct parse *p, register cset *cs, \ 1314 == register char *cp); 1315 */ 1316static void 1317mcadd(p, cs, cp) 1318register struct parse *p; 1319register cset *cs; 1320register char *cp; 1321{ 1322 register size_t oldend = cs->smultis; 1323 1324 cs->smultis += strlen(cp) + 1; 1325 if (cs->multis == NULL) 1326 cs->multis = malloc(cs->smultis); 1327 else 1328 cs->multis = realloc(cs->multis, cs->smultis); 1329 if (cs->multis == NULL) { 1330 SETERROR(REG_ESPACE); 1331 return; 1332 } 1333 1334 (void) strcpy(cs->multis + oldend - 1, cp); 1335 cs->multis[cs->smultis - 1] = '\0'; 1336} 1337 1338#if used 1339/* 1340 - mcsub - subtract a collating element from a cset 1341 == static void mcsub(register cset *cs, register char *cp); 1342 */ 1343static void 1344mcsub(cs, cp) 1345register cset *cs; 1346register char *cp; 1347{ 1348 register char *fp = mcfind(cs, cp); 1349 register size_t len = strlen(fp); 1350 1351 assert(fp != NULL); 1352 (void) memmove(fp, fp + len + 1, 1353 cs->smultis - (fp + len + 1 - cs->multis)); 1354 cs->smultis -= len; 1355 1356 if (cs->smultis == 0) { 1357 free(cs->multis); 1358 cs->multis = NULL; 1359 return; 1360 } 1361 1362 cs->multis = realloc(cs->multis, cs->smultis); 1363 assert(cs->multis != NULL); 1364} 1365 1366/* 1367 - mcin - is a collating element in a cset? 1368 == static int mcin(register cset *cs, register char *cp); 1369 */ 1370static int 1371mcin(cs, cp) 1372register cset *cs; 1373register char *cp; 1374{ 1375 return(mcfind(cs, cp) != NULL); 1376} 1377 1378/* 1379 - mcfind - find a collating element in a cset 1380 == static char *mcfind(register cset *cs, register char *cp); 1381 */ 1382static char * 1383mcfind(cs, cp) 1384register cset *cs; 1385register char *cp; 1386{ 1387 register char *p; 1388 1389 if (cs->multis == NULL) 1390 return(NULL); 1391 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1392 if (strcmp(cp, p) == 0) 1393 return(p); 1394 return(NULL); 1395} 1396#endif 1397 1398/* 1399 - mcinvert - invert the list of collating elements in a cset 1400 == static void mcinvert(register struct parse *p, register cset *cs); 1401 * 1402 * This would have to know the set of possibilities. Implementation 1403 * is deferred. 1404 */ 1405static void 1406mcinvert(p, cs) 1407register struct parse *p; 1408register cset *cs; 1409{ 1410 assert(cs->multis == NULL); /* xxx */ 1411} 1412 1413/* 1414 - mccase - add case counterparts of the list of collating elements in a cset 1415 == static void mccase(register struct parse *p, register cset *cs); 1416 * 1417 * This would have to know the set of possibilities. Implementation 1418 * is deferred. 1419 */ 1420static void 1421mccase(p, cs) 1422register struct parse *p; 1423register cset *cs; 1424{ 1425 assert(cs->multis == NULL); /* xxx */ 1426} 1427 1428/* 1429 - isinsets - is this character in any sets? 1430 == static int isinsets(register struct re_guts *g, int c); 1431 */ 1432static int /* predicate */ 1433isinsets(g, c) 1434register struct re_guts *g; 1435int c; 1436{ 1437 register uch *col; 1438 register int i; 1439 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1440 register unsigned uc = (unsigned char)c; 1441 1442 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1443 if (col[uc] != 0) 1444 return(1); 1445 return(0); 1446} 1447 1448/* 1449 - samesets - are these two characters in exactly the same sets? 1450 == static int samesets(register struct re_guts *g, int c1, int c2); 1451 */ 1452static int /* predicate */ 1453samesets(g, c1, c2) 1454register struct re_guts *g; 1455int c1; 1456int c2; 1457{ 1458 register uch *col; 1459 register int i; 1460 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1461 register unsigned uc1 = (unsigned char)c1; 1462 register unsigned uc2 = (unsigned char)c2; 1463 1464 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1465 if (col[uc1] != col[uc2]) 1466 return(0); 1467 return(1); 1468} 1469 1470/* 1471 - categorize - sort out character categories 1472 == static void categorize(struct parse *p, register struct re_guts *g); 1473 */ 1474static void 1475categorize(p, g) 1476struct parse *p; 1477register struct re_guts *g; 1478{ 1479 register cat_t *cats = g->categories; 1480 register int c; 1481 register int c2; 1482 register cat_t cat; 1483 1484 /* avoid making error situations worse */ 1485 if (p->error != 0) 1486 return; 1487 1488 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1489 if (cats[c] == 0 && isinsets(g, c)) { 1490 cat = g->ncategories++; 1491 cats[c] = cat; 1492 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1493 if (cats[c2] == 0 && samesets(g, c, c2)) 1494 cats[c2] = cat; 1495 } 1496} 1497 1498/* 1499 - dupl - emit a duplicate of a bunch of sops 1500 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1501 */ 1502static sopno /* start of duplicate */ 1503dupl(p, start, finish) 1504register struct parse *p; 1505sopno start; /* from here */ 1506sopno finish; /* to this less one */ 1507{ 1508 register sopno ret = HERE(); 1509 register sopno len = finish - start; 1510 1511 assert(finish >= start); 1512 if (len == 0) 1513 return(ret); 1514 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1515 assert(p->ssize >= p->slen + len); 1516 (void) memcpy((char *)(p->strip + p->slen), 1517 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1518 p->slen += len; 1519 return(ret); 1520} 1521 1522/* 1523 - doemit - emit a strip operator 1524 == static void doemit(register struct parse *p, sop op, size_t opnd); 1525 * 1526 * It might seem better to implement this as a macro with a function as 1527 * hard-case backup, but it's just too big and messy unless there are 1528 * some changes to the data structures. Maybe later. 1529 */ 1530static void 1531doemit(p, op, opnd) 1532register struct parse *p; 1533sop op; 1534size_t opnd; 1535{ 1536 /* avoid making error situations worse */ 1537 if (p->error != 0) 1538 return; 1539 1540 /* deal with oversize operands ("can't happen", more or less) */ 1541 assert(opnd < 1<<OPSHIFT); 1542 1543 /* deal with undersized strip */ 1544 if (p->slen >= p->ssize) 1545 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1546 assert(p->slen < p->ssize); 1547 1548 /* finally, it's all reduced to the easy case */ 1549 p->strip[p->slen++] = SOP(op, opnd); 1550} 1551 1552/* 1553 - doinsert - insert a sop into the strip 1554 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1555 */ 1556static void 1557doinsert(p, op, opnd, pos) 1558register struct parse *p; 1559sop op; 1560size_t opnd; 1561sopno pos; 1562{ 1563 register sopno sn; 1564 register sop s; 1565 register int i; 1566 1567 /* avoid making error situations worse */ 1568 if (p->error != 0) 1569 return; 1570 1571 sn = HERE(); 1572 EMIT(op, opnd); /* do checks, ensure space */ 1573 assert(HERE() == sn+1); 1574 s = p->strip[sn]; 1575 1576 /* adjust paren pointers */ 1577 assert(pos > 0); 1578 for (i = 1; i < NPAREN; i++) { 1579 if (p->pbegin[i] >= pos) { 1580 p->pbegin[i]++; 1581 } 1582 if (p->pend[i] >= pos) { 1583 p->pend[i]++; 1584 } 1585 } 1586 1587 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1588 (HERE()-pos-1)*sizeof(sop)); 1589 p->strip[pos] = s; 1590} 1591 1592/* 1593 - dofwd - complete a forward reference 1594 == static void dofwd(register struct parse *p, sopno pos, sop value); 1595 */ 1596static void 1597dofwd(p, pos, value) 1598register struct parse *p; 1599register sopno pos; 1600sop value; 1601{ 1602 /* avoid making error situations worse */ 1603 if (p->error != 0) 1604 return; 1605 1606 assert(value < 1<<OPSHIFT); 1607 p->strip[pos] = OP(p->strip[pos]) | value; 1608} 1609 1610/* 1611 - enlarge - enlarge the strip 1612 == static void enlarge(register struct parse *p, sopno size); 1613 */ 1614static void 1615enlarge(p, size) 1616register struct parse *p; 1617register sopno size; 1618{ 1619 register sop *sp; 1620 1621 if (p->ssize >= size) 1622 return; 1623 1624 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1625 if (sp == NULL) { 1626 SETERROR(REG_ESPACE); 1627 return; 1628 } 1629 p->strip = sp; 1630 p->ssize = size; 1631} 1632 1633/* 1634 - stripsnug - compact the strip 1635 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1636 */ 1637static void 1638stripsnug(p, g) 1639register struct parse *p; 1640register struct re_guts *g; 1641{ 1642 g->nstates = p->slen; 1643 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1644 if (g->strip == NULL) { 1645 SETERROR(REG_ESPACE); 1646 g->strip = p->strip; 1647 } 1648} 1649 1650/* 1651 - findmust - fill in must and mlen with longest mandatory literal string 1652 == static void findmust(register struct parse *p, register struct re_guts *g); 1653 * 1654 * This algorithm could do fancy things like analyzing the operands of | 1655 * for common subsequences. Someday. This code is simple and finds most 1656 * of the interesting cases. 1657 * 1658 * Note that must and mlen got initialized during setup. 1659 */ 1660static void 1661findmust(p, g) 1662struct parse *p; 1663register struct re_guts *g; 1664{ 1665 register sop *scan; 1666 sop *start; 1667 register sop *newstart; 1668 register sopno newlen; 1669 register sop s; 1670 register char *cp; 1671 register sopno i; 1672 1673 /* avoid making error situations worse */ 1674 if (p->error != 0) 1675 return; 1676 1677 /* find the longest OCHAR sequence in strip */ 1678 newlen = 0; 1679 scan = g->strip + 1; 1680 do { 1681 s = *scan++; 1682 switch (OP(s)) { 1683 case OCHAR: /* sequence member */ 1684 if (newlen == 0) /* new sequence */ 1685 newstart = scan - 1; 1686 newlen++; 1687 break; 1688 case OPLUS_: /* things that don't break one */ 1689 case OLPAREN: 1690 case ORPAREN: 1691 break; 1692 case OQUEST_: /* things that must be skipped */ 1693 case OCH_: 1694 scan--; 1695 do { 1696 scan += OPND(s); 1697 s = *scan; 1698 /* assert() interferes w debug printouts */ 1699 if (OP(s) != O_QUEST && OP(s) != O_CH && 1700 OP(s) != OOR2) { 1701 g->iflags |= BAD; 1702 return; 1703 } 1704 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1705 /* fallthrough */ 1706 default: /* things that break a sequence */ 1707 if (newlen > g->mlen) { /* ends one */ 1708 start = newstart; 1709 g->mlen = newlen; 1710 } 1711 newlen = 0; 1712 break; 1713 } 1714 } while (OP(s) != OEND); 1715 1716 if (g->mlen == 0) /* there isn't one */ 1717 return; 1718 1719 /* turn it into a character string */ 1720 g->must = malloc((size_t)g->mlen + 1); 1721 if (g->must == NULL) { /* argh; just forget it */ 1722 g->mlen = 0; 1723 return; 1724 } 1725 cp = g->must; 1726 scan = start; 1727 for (i = g->mlen; i > 0; i--) { 1728 while (OP(s = *scan++) != OCHAR) 1729 continue; 1730 assert(cp < g->must + g->mlen); 1731 *cp++ = (char)OPND(s); 1732 } 1733 assert(cp == g->must + g->mlen); 1734 *cp++ = '\0'; /* just on general principles */ 1735} 1736 1737/* 1738 - pluscount - count + nesting 1739 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1740 */ 1741static sopno /* nesting depth */ 1742pluscount(p, g) 1743struct parse *p; 1744register struct re_guts *g; 1745{ 1746 register sop *scan; 1747 register sop s; 1748 register sopno plusnest = 0; 1749 register sopno maxnest = 0; 1750 1751 if (p->error != 0) 1752 return(0); /* there may not be an OEND */ 1753 1754 scan = g->strip + 1; 1755 do { 1756 s = *scan++; 1757 switch (OP(s)) { 1758 case OPLUS_: 1759 plusnest++; 1760 break; 1761 case O_PLUS: 1762 if (plusnest > maxnest) 1763 maxnest = plusnest; 1764 plusnest--; 1765 break; 1766 } 1767 } while (OP(s) != OEND); 1768 if (plusnest != 0) 1769 g->iflags |= BAD; 1770 return(maxnest); 1771}
|