regcomp.c revision 17508
1/*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 */ 39 40#if defined(LIBC_SCCS) && !defined(lint) 41static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 42#endif /* LIBC_SCCS and not lint */ 43 44#include <sys/types.h> 45#include <stdio.h> 46#include <string.h> 47#include <ctype.h> 48#include <limits.h> 49#include <stdlib.h> 50#include <regex.h> 51 52#include "utils.h" 53#include "regex2.h" 54 55#include "cclass.h" 56#include "cname.h" 57 58/* 59 * parse structure, passed up and down to avoid global variables and 60 * other clumsinesses 61 */ 62struct parse { 63 char *next; /* next character in RE */ 64 char *end; /* end of string (-> NUL normally) */ 65 int error; /* has an error been seen? */ 66 sop *strip; /* malloced strip */ 67 sopno ssize; /* malloced strip size (allocated) */ 68 sopno slen; /* malloced strip length (used) */ 69 int ncsalloc; /* number of csets allocated */ 70 struct re_guts *g; 71# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 72 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 73 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 74}; 75 76/* ========= begin header generated by ./mkh ========= */ 77#ifdef __cplusplus 78extern "C" { 79#endif 80 81/* === regcomp.c === */ 82static void p_ere __P((struct parse *p, int stop)); 83static void p_ere_exp __P((struct parse *p)); 84static void p_str __P((struct parse *p)); 85static void p_bre __P((struct parse *p, int end1, int end2)); 86static int p_simp_re __P((struct parse *p, int starordinary)); 87static int p_count __P((struct parse *p)); 88static void p_bracket __P((struct parse *p)); 89static void p_b_term __P((struct parse *p, cset *cs)); 90static void p_b_cclass __P((struct parse *p, cset *cs)); 91static void p_b_eclass __P((struct parse *p, cset *cs)); 92static char p_b_symbol __P((struct parse *p)); 93static char p_b_coll_elem __P((struct parse *p, int endc)); 94static char othercase __P((int ch)); 95static void bothcases __P((struct parse *p, int ch)); 96static void ordinary __P((struct parse *p, int ch)); 97static void nonnewline __P((struct parse *p)); 98static void repeat __P((struct parse *p, sopno start, int from, int to)); 99static int seterr __P((struct parse *p, int e)); 100static cset *allocset __P((struct parse *p)); 101static void freeset __P((struct parse *p, cset *cs)); 102static int freezeset __P((struct parse *p, cset *cs)); 103static int firstch __P((struct parse *p, cset *cs)); 104static int nch __P((struct parse *p, cset *cs)); 105static void mcadd __P((struct parse *p, cset *cs, char *cp)); 106#if used 107static void mcsub __P((cset *cs, char *cp)); 108static int mcin __P((cset *cs, char *cp)); 109static char *mcfind __P((cset *cs, char *cp)); 110#endif 111static void mcinvert __P((struct parse *p, cset *cs)); 112static void mccase __P((struct parse *p, cset *cs)); 113static int isinsets __P((struct re_guts *g, int c)); 114static int samesets __P((struct re_guts *g, int c1, int c2)); 115static void categorize __P((struct parse *p, struct re_guts *g)); 116static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 117static void doemit __P((struct parse *p, sop op, size_t opnd)); 118static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 119static void dofwd __P((struct parse *p, sopno pos, sop value)); 120static void enlarge __P((struct parse *p, sopno size)); 121static void stripsnug __P((struct parse *p, struct re_guts *g)); 122static void findmust __P((struct parse *p, struct re_guts *g)); 123static sopno pluscount __P((struct parse *p, struct re_guts *g)); 124 125#ifdef __cplusplus 126} 127#endif 128/* ========= end header generated by ./mkh ========= */ 129 130static char nuls[10]; /* place to point scanner in event of error */ 131 132/* 133 * macros for use with parse structure 134 * BEWARE: these know that the parse structure is named `p' !!! 135 */ 136#define PEEK() (*p->next) 137#define PEEK2() (*(p->next+1)) 138#define MORE() (p->next < p->end) 139#define MORE2() (p->next+1 < p->end) 140#define SEE(c) (MORE() && PEEK() == (c)) 141#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 142#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 143#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 144#define NEXT() (p->next++) 145#define NEXT2() (p->next += 2) 146#define NEXTn(n) (p->next += (n)) 147#define GETNEXT() (*p->next++) 148#define SETERROR(e) seterr(p, (e)) 149#define REQUIRE(co, e) ((co) || SETERROR(e)) 150#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 151#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 152#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 153#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 154#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 155#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 156#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 157#define HERE() (p->slen) 158#define THERE() (p->slen - 1) 159#define THERETHERE() (p->slen - 2) 160#define DROP(n) (p->slen -= (n)) 161 162#ifndef NDEBUG 163static int never = 0; /* for use in asserts; shuts lint up */ 164#else 165#define never 0 /* some <assert.h>s have bugs too */ 166#endif 167 168/* 169 - regcomp - interface for parser and compilation 170 = extern int regcomp(regex_t *, const char *, int); 171 = #define REG_BASIC 0000 172 = #define REG_EXTENDED 0001 173 = #define REG_ICASE 0002 174 = #define REG_NOSUB 0004 175 = #define REG_NEWLINE 0010 176 = #define REG_NOSPEC 0020 177 = #define REG_PEND 0040 178 = #define REG_DUMP 0200 179 */ 180int /* 0 success, otherwise REG_something */ 181regcomp(preg, pattern, cflags) 182regex_t *preg; 183const char *pattern; 184int cflags; 185{ 186 struct parse pa; 187 register struct re_guts *g; 188 register struct parse *p = &pa; 189 register int i; 190 register size_t len; 191#ifdef REDEBUG 192# define GOODFLAGS(f) (f) 193#else 194# define GOODFLAGS(f) ((f)&~REG_DUMP) 195#endif 196 197 cflags = GOODFLAGS(cflags); 198 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 199 return(REG_INVARG); 200 201 if (cflags®_PEND) { 202 if (preg->re_endp < pattern) 203 return(REG_INVARG); 204 len = preg->re_endp - pattern; 205 } else 206 len = strlen((char *)pattern); 207 208 /* do the mallocs early so failure handling is easy */ 209 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 210 (NC-1)*sizeof(cat_t)); 211 if (g == NULL) 212 return(REG_ESPACE); 213 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 214 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 215 p->slen = 0; 216 if (p->strip == NULL) { 217 free((char *)g); 218 return(REG_ESPACE); 219 } 220 221 /* set things up */ 222 p->g = g; 223 p->next = (char *)pattern; /* convenience; we do not modify it */ 224 p->end = p->next + len; 225 p->error = 0; 226 p->ncsalloc = 0; 227 for (i = 0; i < NPAREN; i++) { 228 p->pbegin[i] = 0; 229 p->pend[i] = 0; 230 } 231 g->csetsize = NC; 232 g->sets = NULL; 233 g->setbits = NULL; 234 g->ncsets = 0; 235 g->cflags = cflags; 236 g->iflags = 0; 237 g->nbol = 0; 238 g->neol = 0; 239 g->must = NULL; 240 g->mlen = 0; 241 g->nsub = 0; 242 g->ncategories = 1; /* category 0 is "everything else" */ 243 g->categories = &g->catspace[-(CHAR_MIN)]; 244 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 245 g->backrefs = 0; 246 247 /* do it */ 248 EMIT(OEND, 0); 249 g->firststate = THERE(); 250 if (cflags®_EXTENDED) 251 p_ere(p, OUT); 252 else if (cflags®_NOSPEC) 253 p_str(p); 254 else 255 p_bre(p, OUT, OUT); 256 EMIT(OEND, 0); 257 g->laststate = THERE(); 258 259 /* tidy up loose ends and fill things in */ 260 categorize(p, g); 261 stripsnug(p, g); 262 findmust(p, g); 263 g->nplus = pluscount(p, g); 264 g->magic = MAGIC2; 265 preg->re_nsub = g->nsub; 266 preg->re_g = g; 267 preg->re_magic = MAGIC1; 268#ifndef REDEBUG 269 /* not debugging, so can't rely on the assert() in regexec() */ 270 if (g->iflags&BAD) 271 SETERROR(REG_ASSERT); 272#endif 273 274 /* win or lose, we're done */ 275 if (p->error != 0) /* lose */ 276 regfree(preg); 277 return(p->error); 278} 279 280/* 281 - p_ere - ERE parser top level, concatenation and alternation 282 == static void p_ere(register struct parse *p, int stop); 283 */ 284static void 285p_ere(p, stop) 286register struct parse *p; 287int stop; /* character this ERE should end at */ 288{ 289 register char c; 290 register sopno prevback; 291 register sopno prevfwd; 292 register sopno conc; 293 register int first = 1; /* is this the first alternative? */ 294 295 for (;;) { 296 /* do a bunch of concatenated expressions */ 297 conc = HERE(); 298 while (MORE() && (c = PEEK()) != '|' && c != stop) 299 p_ere_exp(p); 300 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 301 302 if (!EAT('|')) 303 break; /* NOTE BREAK OUT */ 304 305 if (first) { 306 INSERT(OCH_, conc); /* offset is wrong */ 307 prevfwd = conc; 308 prevback = conc; 309 first = 0; 310 } 311 ASTERN(OOR1, prevback); 312 prevback = THERE(); 313 AHEAD(prevfwd); /* fix previous offset */ 314 prevfwd = HERE(); 315 EMIT(OOR2, 0); /* offset is very wrong */ 316 } 317 318 if (!first) { /* tail-end fixups */ 319 AHEAD(prevfwd); 320 ASTERN(O_CH, prevback); 321 } 322 323 assert(!MORE() || SEE(stop)); 324} 325 326/* 327 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 328 == static void p_ere_exp(register struct parse *p); 329 */ 330static void 331p_ere_exp(p) 332register struct parse *p; 333{ 334 register char c; 335 register sopno pos; 336 register int count; 337 register int count2; 338 register sopno subno; 339 int wascaret = 0; 340 341 assert(MORE()); /* caller should have ensured this */ 342 c = GETNEXT(); 343 344 pos = HERE(); 345 switch (c) { 346 case '(': 347 (void)REQUIRE(MORE(), REG_EPAREN); 348 p->g->nsub++; 349 subno = p->g->nsub; 350 if (subno < NPAREN) 351 p->pbegin[subno] = HERE(); 352 EMIT(OLPAREN, subno); 353 if (!SEE(')')) 354 p_ere(p, ')'); 355 if (subno < NPAREN) { 356 p->pend[subno] = HERE(); 357 assert(p->pend[subno] != 0); 358 } 359 EMIT(ORPAREN, subno); 360 (void)MUSTEAT(')', REG_EPAREN); 361 break; 362#ifndef POSIX_MISTAKE 363 case ')': /* happens only if no current unmatched ( */ 364 /* 365 * You may ask, why the ifndef? Because I didn't notice 366 * this until slightly too late for 1003.2, and none of the 367 * other 1003.2 regular-expression reviewers noticed it at 368 * all. So an unmatched ) is legal POSIX, at least until 369 * we can get it fixed. 370 */ 371 SETERROR(REG_EPAREN); 372 break; 373#endif 374 case '^': 375 EMIT(OBOL, 0); 376 p->g->iflags |= USEBOL; 377 p->g->nbol++; 378 wascaret = 1; 379 break; 380 case '$': 381 EMIT(OEOL, 0); 382 p->g->iflags |= USEEOL; 383 p->g->neol++; 384 break; 385 case '|': 386 SETERROR(REG_EMPTY); 387 break; 388 case '*': 389 case '+': 390 case '?': 391 SETERROR(REG_BADRPT); 392 break; 393 case '.': 394 if (p->g->cflags®_NEWLINE) 395 nonnewline(p); 396 else 397 EMIT(OANY, 0); 398 break; 399 case '[': 400 p_bracket(p); 401 break; 402 case '\\': 403 (void)REQUIRE(MORE(), REG_EESCAPE); 404 c = GETNEXT(); 405 ordinary(p, c); 406 break; 407 case '{': /* okay as ordinary except if digit follows */ 408 (void)REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 409 /* FALLTHROUGH */ 410 default: 411 ordinary(p, c); 412 break; 413 } 414 415 if (!MORE()) 416 return; 417 c = PEEK(); 418 /* we call { a repetition if followed by a digit */ 419 if (!( c == '*' || c == '+' || c == '?' || 420 (c == '{' && MORE2() && isdigit(PEEK2())) )) 421 return; /* no repetition, we're done */ 422 NEXT(); 423 424 (void)REQUIRE(!wascaret, REG_BADRPT); 425 switch (c) { 426 case '*': /* implemented as +? */ 427 /* this case does not require the (y|) trick, noKLUDGE */ 428 INSERT(OPLUS_, pos); 429 ASTERN(O_PLUS, pos); 430 INSERT(OQUEST_, pos); 431 ASTERN(O_QUEST, pos); 432 break; 433 case '+': 434 INSERT(OPLUS_, pos); 435 ASTERN(O_PLUS, pos); 436 break; 437 case '?': 438 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 439 INSERT(OCH_, pos); /* offset slightly wrong */ 440 ASTERN(OOR1, pos); /* this one's right */ 441 AHEAD(pos); /* fix the OCH_ */ 442 EMIT(OOR2, 0); /* offset very wrong... */ 443 AHEAD(THERE()); /* ...so fix it */ 444 ASTERN(O_CH, THERETHERE()); 445 break; 446 case '{': 447 count = p_count(p); 448 if (EAT(',')) { 449 if (isdigit(PEEK())) { 450 count2 = p_count(p); 451 (void)REQUIRE(count <= count2, REG_BADBR); 452 } else /* single number with comma */ 453 count2 = INFINITY; 454 } else /* just a single number */ 455 count2 = count; 456 repeat(p, pos, count, count2); 457 if (!EAT('}')) { /* error heuristics */ 458 while (MORE() && PEEK() != '}') 459 NEXT(); 460 (void)REQUIRE(MORE(), REG_EBRACE); 461 SETERROR(REG_BADBR); 462 } 463 break; 464 } 465 466 if (!MORE()) 467 return; 468 c = PEEK(); 469 if (!( c == '*' || c == '+' || c == '?' || 470 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 471 return; 472 SETERROR(REG_BADRPT); 473} 474 475/* 476 - p_str - string (no metacharacters) "parser" 477 == static void p_str(register struct parse *p); 478 */ 479static void 480p_str(p) 481register struct parse *p; 482{ 483 (void)REQUIRE(MORE(), REG_EMPTY); 484 while (MORE()) 485 ordinary(p, GETNEXT()); 486} 487 488/* 489 - p_bre - BRE parser top level, anchoring and concatenation 490 == static void p_bre(register struct parse *p, register int end1, \ 491 == register int end2); 492 * Giving end1 as OUT essentially eliminates the end1/end2 check. 493 * 494 * This implementation is a bit of a kludge, in that a trailing $ is first 495 * taken as an ordinary character and then revised to be an anchor. The 496 * only undesirable side effect is that '$' gets included as a character 497 * category in such cases. This is fairly harmless; not worth fixing. 498 * The amount of lookahead needed to avoid this kludge is excessive. 499 */ 500static void 501p_bre(p, end1, end2) 502register struct parse *p; 503register int end1; /* first terminating character */ 504register int end2; /* second terminating character */ 505{ 506 register sopno start = HERE(); 507 register int first = 1; /* first subexpression? */ 508 register int wasdollar = 0; 509 510 if (EAT('^')) { 511 EMIT(OBOL, 0); 512 p->g->iflags |= USEBOL; 513 p->g->nbol++; 514 } 515 while (MORE() && !SEETWO(end1, end2)) { 516 wasdollar = p_simp_re(p, first); 517 first = 0; 518 } 519 if (wasdollar) { /* oops, that was a trailing anchor */ 520 DROP(1); 521 EMIT(OEOL, 0); 522 p->g->iflags |= USEEOL; 523 p->g->neol++; 524 } 525 526 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 527} 528 529/* 530 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 531 == static int p_simp_re(register struct parse *p, int starordinary); 532 */ 533static int /* was the simple RE an unbackslashed $? */ 534p_simp_re(p, starordinary) 535register struct parse *p; 536int starordinary; /* is a leading * an ordinary character? */ 537{ 538 register int c; 539 register int count; 540 register int count2; 541 register sopno pos; 542 register int i; 543 register sopno subno; 544# define BACKSL (1<<CHAR_BIT) 545 546 pos = HERE(); /* repetion op, if any, covers from here */ 547 548 assert(MORE()); /* caller should have ensured this */ 549 c = GETNEXT(); 550 if (c == '\\') { 551 (void)REQUIRE(MORE(), REG_EESCAPE); 552 c = BACKSL | (unsigned char)GETNEXT(); 553 } 554 switch (c) { 555 case '.': 556 if (p->g->cflags®_NEWLINE) 557 nonnewline(p); 558 else 559 EMIT(OANY, 0); 560 break; 561 case '[': 562 p_bracket(p); 563 break; 564 case BACKSL|'{': 565 SETERROR(REG_BADRPT); 566 break; 567 case BACKSL|'(': 568 p->g->nsub++; 569 subno = p->g->nsub; 570 if (subno < NPAREN) 571 p->pbegin[subno] = HERE(); 572 EMIT(OLPAREN, subno); 573 /* the MORE here is an error heuristic */ 574 if (MORE() && !SEETWO('\\', ')')) 575 p_bre(p, '\\', ')'); 576 if (subno < NPAREN) { 577 p->pend[subno] = HERE(); 578 assert(p->pend[subno] != 0); 579 } 580 EMIT(ORPAREN, subno); 581 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 582 break; 583 case BACKSL|')': /* should not get here -- must be user */ 584 case BACKSL|'}': 585 SETERROR(REG_EPAREN); 586 break; 587 case BACKSL|'1': 588 case BACKSL|'2': 589 case BACKSL|'3': 590 case BACKSL|'4': 591 case BACKSL|'5': 592 case BACKSL|'6': 593 case BACKSL|'7': 594 case BACKSL|'8': 595 case BACKSL|'9': 596 i = (c&~BACKSL) - '0'; 597 assert(i < NPAREN); 598 if (p->pend[i] != 0) { 599 assert(i <= p->g->nsub); 600 EMIT(OBACK_, i); 601 assert(p->pbegin[i] != 0); 602 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 603 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 604 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 605 EMIT(O_BACK, i); 606 } else 607 SETERROR(REG_ESUBREG); 608 p->g->backrefs = 1; 609 break; 610 case '*': 611 (void)REQUIRE(starordinary, REG_BADRPT); 612 /* FALLTHROUGH */ 613 default: 614 ordinary(p, c &~ BACKSL); 615 break; 616 } 617 618 if (EAT('*')) { /* implemented as +? */ 619 /* this case does not require the (y|) trick, noKLUDGE */ 620 INSERT(OPLUS_, pos); 621 ASTERN(O_PLUS, pos); 622 INSERT(OQUEST_, pos); 623 ASTERN(O_QUEST, pos); 624 } else if (EATTWO('\\', '{')) { 625 count = p_count(p); 626 if (EAT(',')) { 627 if (MORE() && isdigit(PEEK())) { 628 count2 = p_count(p); 629 (void)REQUIRE(count <= count2, REG_BADBR); 630 } else /* single number with comma */ 631 count2 = INFINITY; 632 } else /* just a single number */ 633 count2 = count; 634 repeat(p, pos, count, count2); 635 if (!EATTWO('\\', '}')) { /* error heuristics */ 636 while (MORE() && !SEETWO('\\', '}')) 637 NEXT(); 638 (void)REQUIRE(MORE(), REG_EBRACE); 639 SETERROR(REG_BADBR); 640 } 641 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 642 return(1); 643 644 return(0); 645} 646 647/* 648 - p_count - parse a repetition count 649 == static int p_count(register struct parse *p); 650 */ 651static int /* the value */ 652p_count(p) 653register struct parse *p; 654{ 655 register int count = 0; 656 register int ndigits = 0; 657 658 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 659 count = count*10 + (GETNEXT() - '0'); 660 ndigits++; 661 } 662 663 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 664 return(count); 665} 666 667/* 668 - p_bracket - parse a bracketed character list 669 == static void p_bracket(register struct parse *p); 670 * 671 * Note a significant property of this code: if the allocset() did SETERROR, 672 * no set operations are done. 673 */ 674static void 675p_bracket(p) 676register struct parse *p; 677{ 678 register cset *cs = allocset(p); 679 register int invert = 0; 680 681 /* Dept of Truly Sickening Special-Case Kludges */ 682 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 683 EMIT(OBOW, 0); 684 NEXTn(6); 685 return; 686 } 687 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 688 EMIT(OEOW, 0); 689 NEXTn(6); 690 return; 691 } 692 693 if (EAT('^')) 694 invert++; /* make note to invert set at end */ 695 if (EAT(']')) 696 CHadd(cs, ']'); 697 else if (EAT('-')) 698 CHadd(cs, '-'); 699 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 700 p_b_term(p, cs); 701 if (EAT('-')) 702 CHadd(cs, '-'); 703 (void)MUSTEAT(']', REG_EBRACK); 704 705 if (p->error != 0) /* don't mess things up further */ 706 return; 707 708 if (p->g->cflags®_ICASE) { 709 register int i; 710 register int ci; 711 712 for (i = p->g->csetsize - 1; i >= 0; i--) 713 if (CHIN(cs, i) && isalpha(i)) { 714 ci = othercase(i); 715 if (ci != i) 716 CHadd(cs, ci); 717 } 718 if (cs->multis != NULL) 719 mccase(p, cs); 720 } 721 if (invert) { 722 register int i; 723 724 for (i = p->g->csetsize - 1; i >= 0; i--) 725 if (CHIN(cs, i)) 726 CHsub(cs, i); 727 else 728 CHadd(cs, i); 729 if (p->g->cflags®_NEWLINE) 730 CHsub(cs, '\n'); 731 if (cs->multis != NULL) 732 mcinvert(p, cs); 733 } 734 735 assert(cs->multis == NULL); /* xxx */ 736 737 if (nch(p, cs) == 1) { /* optimize singleton sets */ 738 ordinary(p, firstch(p, cs)); 739 freeset(p, cs); 740 } else 741 EMIT(OANYOF, freezeset(p, cs)); 742} 743 744/* 745 - p_b_term - parse one term of a bracketed character list 746 == static void p_b_term(register struct parse *p, register cset *cs); 747 */ 748static void 749p_b_term(p, cs) 750register struct parse *p; 751register cset *cs; 752{ 753 register char c; 754 register char start, finish; 755 register int i; 756 757 /* classify what we've got */ 758 switch ((MORE()) ? PEEK() : '\0') { 759 case '[': 760 c = (MORE2()) ? PEEK2() : '\0'; 761 break; 762 case '-': 763 SETERROR(REG_ERANGE); 764 return; /* NOTE RETURN */ 765 break; 766 default: 767 c = '\0'; 768 break; 769 } 770 771 switch (c) { 772 case ':': /* character class */ 773 NEXT2(); 774 (void)REQUIRE(MORE(), REG_EBRACK); 775 c = PEEK(); 776 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 777 p_b_cclass(p, cs); 778 (void)REQUIRE(MORE(), REG_EBRACK); 779 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 780 break; 781 case '=': /* equivalence class */ 782 NEXT2(); 783 (void)REQUIRE(MORE(), REG_EBRACK); 784 c = PEEK(); 785 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 786 p_b_eclass(p, cs); 787 (void)REQUIRE(MORE(), REG_EBRACK); 788 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 789 break; 790 default: /* symbol, ordinary character, or range */ 791/* xxx revision needed for multichar stuff */ 792 start = p_b_symbol(p); 793 if (SEE('-') && MORE2() && PEEK2() != ']') { 794 /* range */ 795 NEXT(); 796 if (EAT('-')) 797 finish = '-'; 798 else 799 finish = p_b_symbol(p); 800 } else 801 finish = start; 802/* xxx what about signed chars here... */ 803 (void)REQUIRE(start <= finish, REG_ERANGE); 804 for (i = start; i <= finish; i++) 805 CHadd(cs, i); 806 break; 807 } 808} 809 810/* 811 - p_b_cclass - parse a character-class name and deal with it 812 == static void p_b_cclass(register struct parse *p, register cset *cs); 813 */ 814static void 815p_b_cclass(p, cs) 816register struct parse *p; 817register cset *cs; 818{ 819 register int c; 820 register char *sp = p->next; 821 register struct cclass *cp; 822 register size_t len; 823 824 while (MORE() && isalpha((uch)PEEK())) 825 NEXT(); 826 len = p->next - sp; 827 for (cp = cclasses; cp->name != NULL; cp++) 828 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 829 break; 830 if (cp->name == NULL) { 831 /* oops, didn't find it */ 832 SETERROR(REG_ECTYPE); 833 return; 834 } 835 836 switch (cp->fidx) { 837 case CALNUM: 838 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 839 if (isalnum((uch)c)) 840 CHadd(cs, c); 841 break; 842 case CALPHA: 843 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 844 if (isalpha((uch)c)) 845 CHadd(cs, c); 846 break; 847 case CBLANK: 848 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 849 if (isblank((uch)c)) 850 CHadd(cs, c); 851 break; 852 case CCNTRL: 853 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 854 if (iscntrl((uch)c)) 855 CHadd(cs, c); 856 break; 857 case CDIGIT: 858 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 859 if (isdigit((uch)c)) 860 CHadd(cs, c); 861 break; 862 case CGRAPH: 863 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 864 if (isgraph((uch)c)) 865 CHadd(cs, c); 866 break; 867 case CLOWER: 868 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 869 if (islower((uch)c)) 870 CHadd(cs, c); 871 break; 872 case CPRINT: 873 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 874 if (isprint((uch)c)) 875 CHadd(cs, c); 876 break; 877 case CPUNCT: 878 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 879 if (ispunct((uch)c)) 880 CHadd(cs, c); 881 break; 882 case CSPACE: 883 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 884 if (isspace((uch)c)) 885 CHadd(cs, c); 886 break; 887 case CUPPER: 888 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 889 if (isupper((uch)c)) 890 CHadd(cs, c); 891 break; 892 case CXDIGIT: 893 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 894 if (isxdigit((uch)c)) 895 CHadd(cs, c); 896 break; 897 } 898#if 0 899 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 900 MCadd(p, cs, u); 901#endif 902} 903 904/* 905 - p_b_eclass - parse an equivalence-class name and deal with it 906 == static void p_b_eclass(register struct parse *p, register cset *cs); 907 * 908 * This implementation is incomplete. xxx 909 */ 910static void 911p_b_eclass(p, cs) 912register struct parse *p; 913register cset *cs; 914{ 915 register char c; 916 917 c = p_b_coll_elem(p, '='); 918 CHadd(cs, c); 919} 920 921/* 922 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 923 == static char p_b_symbol(register struct parse *p); 924 */ 925static char /* value of symbol */ 926p_b_symbol(p) 927register struct parse *p; 928{ 929 register char value; 930 931 (void)REQUIRE(MORE(), REG_EBRACK); 932 if (!EATTWO('[', '.')) 933 return(GETNEXT()); 934 935 /* collating symbol */ 936 value = p_b_coll_elem(p, '.'); 937 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 938 return(value); 939} 940 941/* 942 - p_b_coll_elem - parse a collating-element name and look it up 943 == static char p_b_coll_elem(register struct parse *p, int endc); 944 */ 945static char /* value of collating element */ 946p_b_coll_elem(p, endc) 947register struct parse *p; 948int endc; /* name ended by endc,']' */ 949{ 950 register char *sp = p->next; 951 register struct cname *cp; 952 register int len; 953 954 while (MORE() && !SEETWO(endc, ']')) 955 NEXT(); 956 if (!MORE()) { 957 SETERROR(REG_EBRACK); 958 return(0); 959 } 960 len = p->next - sp; 961 for (cp = cnames; cp->name != NULL; cp++) 962 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 963 return(cp->code); /* known name */ 964 if (len == 1) 965 return(*sp); /* single character */ 966 SETERROR(REG_ECOLLATE); /* neither */ 967 return(0); 968} 969 970/* 971 - othercase - return the case counterpart of an alphabetic 972 == static char othercase(int ch); 973 */ 974static char /* if no counterpart, return ch */ 975othercase(ch) 976int ch; 977{ 978 ch = (unsigned char)ch; 979 assert(isalpha(ch)); 980 if (isupper(ch)) 981 return(tolower(ch)); 982 else if (islower(ch)) 983 return(toupper(ch)); 984 else /* peculiar, but could happen */ 985 return(ch); 986} 987 988/* 989 - bothcases - emit a dualcase version of a two-case character 990 == static void bothcases(register struct parse *p, int ch); 991 * 992 * Boy, is this implementation ever a kludge... 993 */ 994static void 995bothcases(p, ch) 996register struct parse *p; 997int ch; 998{ 999 register char *oldnext = p->next; 1000 register char *oldend = p->end; 1001 char bracket[3]; 1002 1003 ch = (unsigned char)ch; 1004 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1005 p->next = bracket; 1006 p->end = bracket+2; 1007 bracket[0] = ch; 1008 bracket[1] = ']'; 1009 bracket[2] = '\0'; 1010 p_bracket(p); 1011 assert(p->next == bracket+2); 1012 p->next = oldnext; 1013 p->end = oldend; 1014} 1015 1016/* 1017 - ordinary - emit an ordinary character 1018 == static void ordinary(register struct parse *p, register int ch); 1019 */ 1020static void 1021ordinary(p, ch) 1022register struct parse *p; 1023register int ch; 1024{ 1025 register cat_t *cap = p->g->categories; 1026 1027 if ((p->g->cflags®_ICASE) && isalpha((unsigned char)ch) && othercase(ch) != ch) 1028 bothcases(p, ch); 1029 else { 1030 EMIT(OCHAR, (unsigned char)ch); 1031 if (cap[ch] == 0) 1032 cap[ch] = p->g->ncategories++; 1033 } 1034} 1035 1036/* 1037 - nonnewline - emit REG_NEWLINE version of OANY 1038 == static void nonnewline(register struct parse *p); 1039 * 1040 * Boy, is this implementation ever a kludge... 1041 */ 1042static void 1043nonnewline(p) 1044register struct parse *p; 1045{ 1046 register char *oldnext = p->next; 1047 register char *oldend = p->end; 1048 char bracket[4]; 1049 1050 p->next = bracket; 1051 p->end = bracket+3; 1052 bracket[0] = '^'; 1053 bracket[1] = '\n'; 1054 bracket[2] = ']'; 1055 bracket[3] = '\0'; 1056 p_bracket(p); 1057 assert(p->next == bracket+3); 1058 p->next = oldnext; 1059 p->end = oldend; 1060} 1061 1062/* 1063 - repeat - generate code for a bounded repetition, recursively if needed 1064 == static void repeat(register struct parse *p, sopno start, int from, int to); 1065 */ 1066static void 1067repeat(p, start, from, to) 1068register struct parse *p; 1069sopno start; /* operand from here to end of strip */ 1070int from; /* repeated from this number */ 1071int to; /* to this number of times (maybe INFINITY) */ 1072{ 1073 register sopno finish = HERE(); 1074# define N 2 1075# define INF 3 1076# define REP(f, t) ((f)*8 + (t)) 1077# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1078 register sopno copy; 1079 1080 if (p->error != 0) /* head off possible runaway recursion */ 1081 return; 1082 1083 assert(from <= to); 1084 1085 switch (REP(MAP(from), MAP(to))) { 1086 case REP(0, 0): /* must be user doing this */ 1087 DROP(finish-start); /* drop the operand */ 1088 break; 1089 case REP(0, 1): /* as x{1,1}? */ 1090 case REP(0, N): /* as x{1,n}? */ 1091 case REP(0, INF): /* as x{1,}? */ 1092 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1093 INSERT(OCH_, start); /* offset is wrong... */ 1094 repeat(p, start+1, 1, to); 1095 ASTERN(OOR1, start); 1096 AHEAD(start); /* ... fix it */ 1097 EMIT(OOR2, 0); 1098 AHEAD(THERE()); 1099 ASTERN(O_CH, THERETHERE()); 1100 break; 1101 case REP(1, 1): /* trivial case */ 1102 /* done */ 1103 break; 1104 case REP(1, N): /* as x?x{1,n-1} */ 1105 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1106 INSERT(OCH_, start); 1107 ASTERN(OOR1, start); 1108 AHEAD(start); 1109 EMIT(OOR2, 0); /* offset very wrong... */ 1110 AHEAD(THERE()); /* ...so fix it */ 1111 ASTERN(O_CH, THERETHERE()); 1112 copy = dupl(p, start+1, finish+1); 1113 assert(copy == finish+4); 1114 repeat(p, copy, 1, to-1); 1115 break; 1116 case REP(1, INF): /* as x+ */ 1117 INSERT(OPLUS_, start); 1118 ASTERN(O_PLUS, start); 1119 break; 1120 case REP(N, N): /* as xx{m-1,n-1} */ 1121 copy = dupl(p, start, finish); 1122 repeat(p, copy, from-1, to-1); 1123 break; 1124 case REP(N, INF): /* as xx{n-1,INF} */ 1125 copy = dupl(p, start, finish); 1126 repeat(p, copy, from-1, to); 1127 break; 1128 default: /* "can't happen" */ 1129 SETERROR(REG_ASSERT); /* just in case */ 1130 break; 1131 } 1132} 1133 1134/* 1135 - seterr - set an error condition 1136 == static int seterr(register struct parse *p, int e); 1137 */ 1138static int /* useless but makes type checking happy */ 1139seterr(p, e) 1140register struct parse *p; 1141int e; 1142{ 1143 if (p->error == 0) /* keep earliest error condition */ 1144 p->error = e; 1145 p->next = nuls; /* try to bring things to a halt */ 1146 p->end = nuls; 1147 return(0); /* make the return value well-defined */ 1148} 1149 1150/* 1151 - allocset - allocate a set of characters for [] 1152 == static cset *allocset(register struct parse *p); 1153 */ 1154static cset * 1155allocset(p) 1156register struct parse *p; 1157{ 1158 register int no = p->g->ncsets++; 1159 register size_t nc; 1160 register size_t nbytes; 1161 register cset *cs; 1162 register size_t css = (size_t)p->g->csetsize; 1163 register int i; 1164 1165 if (no >= p->ncsalloc) { /* need another column of space */ 1166 p->ncsalloc += CHAR_BIT; 1167 nc = p->ncsalloc; 1168 assert(nc % CHAR_BIT == 0); 1169 nbytes = nc / CHAR_BIT * css; 1170 if (p->g->sets == NULL) 1171 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1172 else 1173 p->g->sets = (cset *)realloc((char *)p->g->sets, 1174 nc * sizeof(cset)); 1175 if (p->g->setbits == NULL) 1176 p->g->setbits = (uch *)malloc(nbytes); 1177 else { 1178 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1179 nbytes); 1180 /* xxx this isn't right if setbits is now NULL */ 1181 for (i = 0; i < no; i++) 1182 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1183 } 1184 if (p->g->sets != NULL && p->g->setbits != NULL) 1185 (void) memset((char *)p->g->setbits + (nbytes - css), 1186 0, css); 1187 else { 1188 no = 0; 1189 SETERROR(REG_ESPACE); 1190 /* caller's responsibility not to do set ops */ 1191 } 1192 } 1193 1194 assert(p->g->sets != NULL); /* xxx */ 1195 cs = &p->g->sets[no]; 1196 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1197 cs->mask = 1 << ((no) % CHAR_BIT); 1198 cs->hash = 0; 1199 cs->smultis = 0; 1200 cs->multis = NULL; 1201 1202 return(cs); 1203} 1204 1205/* 1206 - freeset - free a now-unused set 1207 == static void freeset(register struct parse *p, register cset *cs); 1208 */ 1209static void 1210freeset(p, cs) 1211register struct parse *p; 1212register cset *cs; 1213{ 1214 register int i; 1215 register cset *top = &p->g->sets[p->g->ncsets]; 1216 register size_t css = (size_t)p->g->csetsize; 1217 1218 for (i = 0; i < css; i++) 1219 CHsub(cs, i); 1220 if (cs == top-1) /* recover only the easy case */ 1221 p->g->ncsets--; 1222} 1223 1224/* 1225 - freezeset - final processing on a set of characters 1226 == static int freezeset(register struct parse *p, register cset *cs); 1227 * 1228 * The main task here is merging identical sets. This is usually a waste 1229 * of time (although the hash code minimizes the overhead), but can win 1230 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1231 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1232 * the same value! 1233 */ 1234static int /* set number */ 1235freezeset(p, cs) 1236register struct parse *p; 1237register cset *cs; 1238{ 1239 register uch h = cs->hash; 1240 register int i; 1241 register cset *top = &p->g->sets[p->g->ncsets]; 1242 register cset *cs2; 1243 register size_t css = (size_t)p->g->csetsize; 1244 1245 /* look for an earlier one which is the same */ 1246 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1247 if (cs2->hash == h && cs2 != cs) { 1248 /* maybe */ 1249 for (i = 0; i < css; i++) 1250 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1251 break; /* no */ 1252 if (i == css) 1253 break; /* yes */ 1254 } 1255 1256 if (cs2 < top) { /* found one */ 1257 freeset(p, cs); 1258 cs = cs2; 1259 } 1260 1261 return((int)(cs - p->g->sets)); 1262} 1263 1264/* 1265 - firstch - return first character in a set (which must have at least one) 1266 == static int firstch(register struct parse *p, register cset *cs); 1267 */ 1268static int /* character; there is no "none" value */ 1269firstch(p, cs) 1270register struct parse *p; 1271register cset *cs; 1272{ 1273 register int i; 1274 register size_t css = (size_t)p->g->csetsize; 1275 1276 for (i = 0; i < css; i++) 1277 if (CHIN(cs, i)) 1278 return((unsigned char)i); 1279 assert(never); 1280 return(0); /* arbitrary */ 1281} 1282 1283/* 1284 - nch - number of characters in a set 1285 == static int nch(register struct parse *p, register cset *cs); 1286 */ 1287static int 1288nch(p, cs) 1289register struct parse *p; 1290register cset *cs; 1291{ 1292 register int i; 1293 register size_t css = (size_t)p->g->csetsize; 1294 register int n = 0; 1295 1296 for (i = 0; i < css; i++) 1297 if (CHIN(cs, i)) 1298 n++; 1299 return(n); 1300} 1301 1302/* 1303 - mcadd - add a collating element to a cset 1304 == static void mcadd(register struct parse *p, register cset *cs, \ 1305 == register char *cp); 1306 */ 1307static void 1308mcadd(p, cs, cp) 1309register struct parse *p; 1310register cset *cs; 1311register char *cp; 1312{ 1313 register size_t oldend = cs->smultis; 1314 1315 cs->smultis += strlen(cp) + 1; 1316 if (cs->multis == NULL) 1317 cs->multis = malloc(cs->smultis); 1318 else 1319 cs->multis = realloc(cs->multis, cs->smultis); 1320 if (cs->multis == NULL) { 1321 SETERROR(REG_ESPACE); 1322 return; 1323 } 1324 1325 (void) strcpy(cs->multis + oldend - 1, cp); 1326 cs->multis[cs->smultis - 1] = '\0'; 1327} 1328 1329#if used 1330/* 1331 - mcsub - subtract a collating element from a cset 1332 == static void mcsub(register cset *cs, register char *cp); 1333 */ 1334static void 1335mcsub(cs, cp) 1336register cset *cs; 1337register char *cp; 1338{ 1339 register char *fp = mcfind(cs, cp); 1340 register size_t len = strlen(fp); 1341 1342 assert(fp != NULL); 1343 (void) memmove(fp, fp + len + 1, 1344 cs->smultis - (fp + len + 1 - cs->multis)); 1345 cs->smultis -= len; 1346 1347 if (cs->smultis == 0) { 1348 free(cs->multis); 1349 cs->multis = NULL; 1350 return; 1351 } 1352 1353 cs->multis = realloc(cs->multis, cs->smultis); 1354 assert(cs->multis != NULL); 1355} 1356 1357/* 1358 - mcin - is a collating element in a cset? 1359 == static int mcin(register cset *cs, register char *cp); 1360 */ 1361static int 1362mcin(cs, cp) 1363register cset *cs; 1364register char *cp; 1365{ 1366 return(mcfind(cs, cp) != NULL); 1367} 1368 1369/* 1370 - mcfind - find a collating element in a cset 1371 == static char *mcfind(register cset *cs, register char *cp); 1372 */ 1373static char * 1374mcfind(cs, cp) 1375register cset *cs; 1376register char *cp; 1377{ 1378 register char *p; 1379 1380 if (cs->multis == NULL) 1381 return(NULL); 1382 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1383 if (strcmp(cp, p) == 0) 1384 return(p); 1385 return(NULL); 1386} 1387#endif 1388 1389/* 1390 - mcinvert - invert the list of collating elements in a cset 1391 == static void mcinvert(register struct parse *p, register cset *cs); 1392 * 1393 * This would have to know the set of possibilities. Implementation 1394 * is deferred. 1395 */ 1396static void 1397mcinvert(p, cs) 1398register struct parse *p; 1399register cset *cs; 1400{ 1401 assert(cs->multis == NULL); /* xxx */ 1402} 1403 1404/* 1405 - mccase - add case counterparts of the list of collating elements in a cset 1406 == static void mccase(register struct parse *p, register cset *cs); 1407 * 1408 * This would have to know the set of possibilities. Implementation 1409 * is deferred. 1410 */ 1411static void 1412mccase(p, cs) 1413register struct parse *p; 1414register cset *cs; 1415{ 1416 assert(cs->multis == NULL); /* xxx */ 1417} 1418 1419/* 1420 - isinsets - is this character in any sets? 1421 == static int isinsets(register struct re_guts *g, int c); 1422 */ 1423static int /* predicate */ 1424isinsets(g, c) 1425register struct re_guts *g; 1426int c; 1427{ 1428 register uch *col; 1429 register int i; 1430 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1431 register unsigned uc = (unsigned char)c; 1432 1433 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1434 if (col[uc] != 0) 1435 return(1); 1436 return(0); 1437} 1438 1439/* 1440 - samesets - are these two characters in exactly the same sets? 1441 == static int samesets(register struct re_guts *g, int c1, int c2); 1442 */ 1443static int /* predicate */ 1444samesets(g, c1, c2) 1445register struct re_guts *g; 1446int c1; 1447int c2; 1448{ 1449 register uch *col; 1450 register int i; 1451 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1452 register unsigned uc1 = (unsigned char)c1; 1453 register unsigned uc2 = (unsigned char)c2; 1454 1455 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1456 if (col[uc1] != col[uc2]) 1457 return(0); 1458 return(1); 1459} 1460 1461/* 1462 - categorize - sort out character categories 1463 == static void categorize(struct parse *p, register struct re_guts *g); 1464 */ 1465static void 1466categorize(p, g) 1467struct parse *p; 1468register struct re_guts *g; 1469{ 1470 register cat_t *cats = g->categories; 1471 register int c; 1472 register int c2; 1473 register cat_t cat; 1474 1475 /* avoid making error situations worse */ 1476 if (p->error != 0) 1477 return; 1478 1479 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1480 if (cats[c] == 0 && isinsets(g, c)) { 1481 cat = g->ncategories++; 1482 cats[c] = cat; 1483 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1484 if (cats[c2] == 0 && samesets(g, c, c2)) 1485 cats[c2] = cat; 1486 } 1487} 1488 1489/* 1490 - dupl - emit a duplicate of a bunch of sops 1491 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1492 */ 1493static sopno /* start of duplicate */ 1494dupl(p, start, finish) 1495register struct parse *p; 1496sopno start; /* from here */ 1497sopno finish; /* to this less one */ 1498{ 1499 register sopno ret = HERE(); 1500 register sopno len = finish - start; 1501 1502 assert(finish >= start); 1503 if (len == 0) 1504 return(ret); 1505 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1506 assert(p->ssize >= p->slen + len); 1507 (void) memcpy((char *)(p->strip + p->slen), 1508 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1509 p->slen += len; 1510 return(ret); 1511} 1512 1513/* 1514 - doemit - emit a strip operator 1515 == static void doemit(register struct parse *p, sop op, size_t opnd); 1516 * 1517 * It might seem better to implement this as a macro with a function as 1518 * hard-case backup, but it's just too big and messy unless there are 1519 * some changes to the data structures. Maybe later. 1520 */ 1521static void 1522doemit(p, op, opnd) 1523register struct parse *p; 1524sop op; 1525size_t opnd; 1526{ 1527 /* avoid making error situations worse */ 1528 if (p->error != 0) 1529 return; 1530 1531 /* deal with oversize operands ("can't happen", more or less) */ 1532 assert(opnd < 1<<OPSHIFT); 1533 1534 /* deal with undersized strip */ 1535 if (p->slen >= p->ssize) 1536 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1537 assert(p->slen < p->ssize); 1538 1539 /* finally, it's all reduced to the easy case */ 1540 p->strip[p->slen++] = SOP(op, opnd); 1541} 1542 1543/* 1544 - doinsert - insert a sop into the strip 1545 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1546 */ 1547static void 1548doinsert(p, op, opnd, pos) 1549register struct parse *p; 1550sop op; 1551size_t opnd; 1552sopno pos; 1553{ 1554 register sopno sn; 1555 register sop s; 1556 register int i; 1557 1558 /* avoid making error situations worse */ 1559 if (p->error != 0) 1560 return; 1561 1562 sn = HERE(); 1563 EMIT(op, opnd); /* do checks, ensure space */ 1564 assert(HERE() == sn+1); 1565 s = p->strip[sn]; 1566 1567 /* adjust paren pointers */ 1568 assert(pos > 0); 1569 for (i = 1; i < NPAREN; i++) { 1570 if (p->pbegin[i] >= pos) { 1571 p->pbegin[i]++; 1572 } 1573 if (p->pend[i] >= pos) { 1574 p->pend[i]++; 1575 } 1576 } 1577 1578 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1579 (HERE()-pos-1)*sizeof(sop)); 1580 p->strip[pos] = s; 1581} 1582 1583/* 1584 - dofwd - complete a forward reference 1585 == static void dofwd(register struct parse *p, sopno pos, sop value); 1586 */ 1587static void 1588dofwd(p, pos, value) 1589register struct parse *p; 1590register sopno pos; 1591sop value; 1592{ 1593 /* avoid making error situations worse */ 1594 if (p->error != 0) 1595 return; 1596 1597 assert(value < 1<<OPSHIFT); 1598 p->strip[pos] = OP(p->strip[pos]) | value; 1599} 1600 1601/* 1602 - enlarge - enlarge the strip 1603 == static void enlarge(register struct parse *p, sopno size); 1604 */ 1605static void 1606enlarge(p, size) 1607register struct parse *p; 1608register sopno size; 1609{ 1610 register sop *sp; 1611 1612 if (p->ssize >= size) 1613 return; 1614 1615 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1616 if (sp == NULL) { 1617 SETERROR(REG_ESPACE); 1618 return; 1619 } 1620 p->strip = sp; 1621 p->ssize = size; 1622} 1623 1624/* 1625 - stripsnug - compact the strip 1626 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1627 */ 1628static void 1629stripsnug(p, g) 1630register struct parse *p; 1631register struct re_guts *g; 1632{ 1633 g->nstates = p->slen; 1634 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1635 if (g->strip == NULL) { 1636 SETERROR(REG_ESPACE); 1637 g->strip = p->strip; 1638 } 1639} 1640 1641/* 1642 - findmust - fill in must and mlen with longest mandatory literal string 1643 == static void findmust(register struct parse *p, register struct re_guts *g); 1644 * 1645 * This algorithm could do fancy things like analyzing the operands of | 1646 * for common subsequences. Someday. This code is simple and finds most 1647 * of the interesting cases. 1648 * 1649 * Note that must and mlen got initialized during setup. 1650 */ 1651static void 1652findmust(p, g) 1653struct parse *p; 1654register struct re_guts *g; 1655{ 1656 register sop *scan; 1657 sop *start; 1658 register sop *newstart; 1659 register sopno newlen; 1660 register sop s; 1661 register char *cp; 1662 register sopno i; 1663 1664 /* avoid making error situations worse */ 1665 if (p->error != 0) 1666 return; 1667 1668 /* find the longest OCHAR sequence in strip */ 1669 newlen = 0; 1670 scan = g->strip + 1; 1671 do { 1672 s = *scan++; 1673 switch (OP(s)) { 1674 case OCHAR: /* sequence member */ 1675 if (newlen == 0) /* new sequence */ 1676 newstart = scan - 1; 1677 newlen++; 1678 break; 1679 case OPLUS_: /* things that don't break one */ 1680 case OLPAREN: 1681 case ORPAREN: 1682 break; 1683 case OQUEST_: /* things that must be skipped */ 1684 case OCH_: 1685 scan--; 1686 do { 1687 scan += OPND(s); 1688 s = *scan; 1689 /* assert() interferes w debug printouts */ 1690 if (OP(s) != O_QUEST && OP(s) != O_CH && 1691 OP(s) != OOR2) { 1692 g->iflags |= BAD; 1693 return; 1694 } 1695 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1696 /* fallthrough */ 1697 default: /* things that break a sequence */ 1698 if (newlen > g->mlen) { /* ends one */ 1699 start = newstart; 1700 g->mlen = newlen; 1701 } 1702 newlen = 0; 1703 break; 1704 } 1705 } while (OP(s) != OEND); 1706 1707 if (g->mlen == 0) /* there isn't one */ 1708 return; 1709 1710 /* turn it into a character string */ 1711 g->must = malloc((size_t)g->mlen + 1); 1712 if (g->must == NULL) { /* argh; just forget it */ 1713 g->mlen = 0; 1714 return; 1715 } 1716 cp = g->must; 1717 scan = start; 1718 for (i = g->mlen; i > 0; i--) { 1719 while (OP(s = *scan++) != OCHAR) 1720 continue; 1721 assert(cp < g->must + g->mlen); 1722 *cp++ = (char)OPND(s); 1723 } 1724 assert(cp == g->must + g->mlen); 1725 *cp++ = '\0'; /* just on general principles */ 1726} 1727 1728/* 1729 - pluscount - count + nesting 1730 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1731 */ 1732static sopno /* nesting depth */ 1733pluscount(p, g) 1734struct parse *p; 1735register struct re_guts *g; 1736{ 1737 register sop *scan; 1738 register sop s; 1739 register sopno plusnest = 0; 1740 register sopno maxnest = 0; 1741 1742 if (p->error != 0) 1743 return(0); /* there may not be an OEND */ 1744 1745 scan = g->strip + 1; 1746 do { 1747 s = *scan++; 1748 switch (OP(s)) { 1749 case OPLUS_: 1750 plusnest++; 1751 break; 1752 case O_PLUS: 1753 if (plusnest > maxnest) 1754 maxnest = plusnest; 1755 plusnest--; 1756 break; 1757 } 1758 } while (OP(s) != OEND); 1759 if (plusnest != 0) 1760 g->iflags |= BAD; 1761 return(maxnest); 1762} 1763