1/*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 */ 39 40#if defined(LIBC_SCCS) && !defined(lint) 41static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 42#endif /* LIBC_SCCS and not lint */ 43 44#include <sys/types.h> 45#include <stdio.h> 46#include <string.h> 47#include <ctype.h> 48#include <limits.h>
| 1/*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 */ 39 40#if defined(LIBC_SCCS) && !defined(lint) 41static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 42#endif /* LIBC_SCCS and not lint */ 43 44#include <sys/types.h> 45#include <stdio.h> 46#include <string.h> 47#include <ctype.h> 48#include <limits.h>
|
49#include <locale.h>
| |
50#include <stdlib.h> 51#include <regex.h> 52
| 49#include <stdlib.h> 50#include <regex.h> 51
|
| 52#include "collate.h" 53
|
53#include "utils.h" 54#include "regex2.h" 55 56#include "cclass.h" 57#include "cname.h" 58 59/* 60 * parse structure, passed up and down to avoid global variables and 61 * other clumsinesses 62 */ 63struct parse { 64 char *next; /* next character in RE */ 65 char *end; /* end of string (-> NUL normally) */ 66 int error; /* has an error been seen? */ 67 sop *strip; /* malloced strip */ 68 sopno ssize; /* malloced strip size (allocated) */ 69 sopno slen; /* malloced strip length (used) */ 70 int ncsalloc; /* number of csets allocated */ 71 struct re_guts *g; 72# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 73 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 74 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 75}; 76 77/* ========= begin header generated by ./mkh ========= */ 78#ifdef __cplusplus 79extern "C" { 80#endif 81 82/* === regcomp.c === */ 83static void p_ere __P((struct parse *p, int stop)); 84static void p_ere_exp __P((struct parse *p)); 85static void p_str __P((struct parse *p)); 86static void p_bre __P((struct parse *p, int end1, int end2)); 87static int p_simp_re __P((struct parse *p, int starordinary)); 88static int p_count __P((struct parse *p)); 89static void p_bracket __P((struct parse *p)); 90static void p_b_term __P((struct parse *p, cset *cs)); 91static void p_b_cclass __P((struct parse *p, cset *cs)); 92static void p_b_eclass __P((struct parse *p, cset *cs)); 93static char p_b_symbol __P((struct parse *p)); 94static char p_b_coll_elem __P((struct parse *p, int endc)); 95static char othercase __P((int ch)); 96static void bothcases __P((struct parse *p, int ch)); 97static void ordinary __P((struct parse *p, int ch)); 98static void nonnewline __P((struct parse *p)); 99static void repeat __P((struct parse *p, sopno start, int from, int to)); 100static int seterr __P((struct parse *p, int e)); 101static cset *allocset __P((struct parse *p)); 102static void freeset __P((struct parse *p, cset *cs)); 103static int freezeset __P((struct parse *p, cset *cs)); 104static int firstch __P((struct parse *p, cset *cs)); 105static int nch __P((struct parse *p, cset *cs)); 106static void mcadd __P((struct parse *p, cset *cs, char *cp)); 107#if used 108static void mcsub __P((cset *cs, char *cp)); 109static int mcin __P((cset *cs, char *cp)); 110static char *mcfind __P((cset *cs, char *cp)); 111#endif 112static void mcinvert __P((struct parse *p, cset *cs)); 113static void mccase __P((struct parse *p, cset *cs)); 114static int isinsets __P((struct re_guts *g, int c)); 115static int samesets __P((struct re_guts *g, int c1, int c2)); 116static void categorize __P((struct parse *p, struct re_guts *g)); 117static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 118static void doemit __P((struct parse *p, sop op, size_t opnd)); 119static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 120static void dofwd __P((struct parse *p, sopno pos, sop value)); 121static void enlarge __P((struct parse *p, sopno size)); 122static void stripsnug __P((struct parse *p, struct re_guts *g)); 123static void findmust __P((struct parse *p, struct re_guts *g)); 124static sopno pluscount __P((struct parse *p, struct re_guts *g)); 125 126#ifdef __cplusplus 127} 128#endif 129/* ========= end header generated by ./mkh ========= */ 130 131static char nuls[10]; /* place to point scanner in event of error */ 132 133/* 134 * macros for use with parse structure 135 * BEWARE: these know that the parse structure is named `p' !!! 136 */ 137#define PEEK() (*p->next) 138#define PEEK2() (*(p->next+1)) 139#define MORE() (p->next < p->end) 140#define MORE2() (p->next+1 < p->end) 141#define SEE(c) (MORE() && PEEK() == (c)) 142#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 143#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 144#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 145#define NEXT() (p->next++) 146#define NEXT2() (p->next += 2) 147#define NEXTn(n) (p->next += (n)) 148#define GETNEXT() (*p->next++) 149#define SETERROR(e) seterr(p, (e)) 150#define REQUIRE(co, e) ((co) || SETERROR(e)) 151#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 152#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 153#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 154#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 155#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 156#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 157#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 158#define HERE() (p->slen) 159#define THERE() (p->slen - 1) 160#define THERETHERE() (p->slen - 2) 161#define DROP(n) (p->slen -= (n)) 162 163#ifndef NDEBUG 164static int never = 0; /* for use in asserts; shuts lint up */ 165#else 166#define never 0 /* some <assert.h>s have bugs too */ 167#endif 168 169/* 170 - regcomp - interface for parser and compilation 171 = extern int regcomp(regex_t *, const char *, int); 172 = #define REG_BASIC 0000 173 = #define REG_EXTENDED 0001 174 = #define REG_ICASE 0002 175 = #define REG_NOSUB 0004 176 = #define REG_NEWLINE 0010 177 = #define REG_NOSPEC 0020 178 = #define REG_PEND 0040 179 = #define REG_DUMP 0200 180 */ 181int /* 0 success, otherwise REG_something */ 182regcomp(preg, pattern, cflags) 183regex_t *preg; 184const char *pattern; 185int cflags; 186{ 187 struct parse pa; 188 register struct re_guts *g; 189 register struct parse *p = &pa; 190 register int i; 191 register size_t len; 192#ifdef REDEBUG 193# define GOODFLAGS(f) (f) 194#else 195# define GOODFLAGS(f) ((f)&~REG_DUMP) 196#endif 197 198 cflags = GOODFLAGS(cflags); 199 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 200 return(REG_INVARG); 201 202 if (cflags®_PEND) { 203 if (preg->re_endp < pattern) 204 return(REG_INVARG); 205 len = preg->re_endp - pattern; 206 } else 207 len = strlen((char *)pattern); 208 209 /* do the mallocs early so failure handling is easy */ 210 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 211 (NC-1)*sizeof(cat_t)); 212 if (g == NULL) 213 return(REG_ESPACE); 214 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 215 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 216 p->slen = 0; 217 if (p->strip == NULL) { 218 free((char *)g); 219 return(REG_ESPACE); 220 } 221 222 /* set things up */ 223 p->g = g; 224 p->next = (char *)pattern; /* convenience; we do not modify it */ 225 p->end = p->next + len; 226 p->error = 0; 227 p->ncsalloc = 0; 228 for (i = 0; i < NPAREN; i++) { 229 p->pbegin[i] = 0; 230 p->pend[i] = 0; 231 } 232 g->csetsize = NC; 233 g->sets = NULL; 234 g->setbits = NULL; 235 g->ncsets = 0; 236 g->cflags = cflags; 237 g->iflags = 0; 238 g->nbol = 0; 239 g->neol = 0; 240 g->must = NULL; 241 g->mlen = 0; 242 g->nsub = 0; 243 g->ncategories = 1; /* category 0 is "everything else" */ 244 g->categories = &g->catspace[-(CHAR_MIN)]; 245 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 246 g->backrefs = 0; 247 248 /* do it */ 249 EMIT(OEND, 0); 250 g->firststate = THERE(); 251 if (cflags®_EXTENDED) 252 p_ere(p, OUT); 253 else if (cflags®_NOSPEC) 254 p_str(p); 255 else 256 p_bre(p, OUT, OUT); 257 EMIT(OEND, 0); 258 g->laststate = THERE(); 259 260 /* tidy up loose ends and fill things in */ 261 categorize(p, g); 262 stripsnug(p, g); 263 findmust(p, g); 264 g->nplus = pluscount(p, g); 265 g->magic = MAGIC2; 266 preg->re_nsub = g->nsub; 267 preg->re_g = g; 268 preg->re_magic = MAGIC1; 269#ifndef REDEBUG 270 /* not debugging, so can't rely on the assert() in regexec() */ 271 if (g->iflags&BAD) 272 SETERROR(REG_ASSERT); 273#endif 274 275 /* win or lose, we're done */ 276 if (p->error != 0) /* lose */ 277 regfree(preg); 278 return(p->error); 279} 280 281/* 282 - p_ere - ERE parser top level, concatenation and alternation 283 == static void p_ere(register struct parse *p, int stop); 284 */ 285static void 286p_ere(p, stop) 287register struct parse *p; 288int stop; /* character this ERE should end at */ 289{ 290 register char c; 291 register sopno prevback; 292 register sopno prevfwd; 293 register sopno conc; 294 register int first = 1; /* is this the first alternative? */ 295 296 for (;;) { 297 /* do a bunch of concatenated expressions */ 298 conc = HERE(); 299 while (MORE() && (c = PEEK()) != '|' && c != stop) 300 p_ere_exp(p); 301 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 302 303 if (!EAT('|')) 304 break; /* NOTE BREAK OUT */ 305 306 if (first) { 307 INSERT(OCH_, conc); /* offset is wrong */ 308 prevfwd = conc; 309 prevback = conc; 310 first = 0; 311 } 312 ASTERN(OOR1, prevback); 313 prevback = THERE(); 314 AHEAD(prevfwd); /* fix previous offset */ 315 prevfwd = HERE(); 316 EMIT(OOR2, 0); /* offset is very wrong */ 317 } 318 319 if (!first) { /* tail-end fixups */ 320 AHEAD(prevfwd); 321 ASTERN(O_CH, prevback); 322 } 323 324 assert(!MORE() || SEE(stop)); 325} 326 327/* 328 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 329 == static void p_ere_exp(register struct parse *p); 330 */ 331static void 332p_ere_exp(p) 333register struct parse *p; 334{ 335 register char c; 336 register sopno pos; 337 register int count; 338 register int count2; 339 register sopno subno; 340 int wascaret = 0; 341 342 assert(MORE()); /* caller should have ensured this */ 343 c = GETNEXT(); 344 345 pos = HERE(); 346 switch (c) { 347 case '(': 348 (void)REQUIRE(MORE(), REG_EPAREN); 349 p->g->nsub++; 350 subno = p->g->nsub; 351 if (subno < NPAREN) 352 p->pbegin[subno] = HERE(); 353 EMIT(OLPAREN, subno); 354 if (!SEE(')')) 355 p_ere(p, ')'); 356 if (subno < NPAREN) { 357 p->pend[subno] = HERE(); 358 assert(p->pend[subno] != 0); 359 } 360 EMIT(ORPAREN, subno); 361 (void)MUSTEAT(')', REG_EPAREN); 362 break; 363#ifndef POSIX_MISTAKE 364 case ')': /* happens only if no current unmatched ( */ 365 /* 366 * You may ask, why the ifndef? Because I didn't notice 367 * this until slightly too late for 1003.2, and none of the 368 * other 1003.2 regular-expression reviewers noticed it at 369 * all. So an unmatched ) is legal POSIX, at least until 370 * we can get it fixed. 371 */ 372 SETERROR(REG_EPAREN); 373 break; 374#endif 375 case '^': 376 EMIT(OBOL, 0); 377 p->g->iflags |= USEBOL; 378 p->g->nbol++; 379 wascaret = 1; 380 break; 381 case '$': 382 EMIT(OEOL, 0); 383 p->g->iflags |= USEEOL; 384 p->g->neol++; 385 break; 386 case '|': 387 SETERROR(REG_EMPTY); 388 break; 389 case '*': 390 case '+': 391 case '?': 392 SETERROR(REG_BADRPT); 393 break; 394 case '.': 395 if (p->g->cflags®_NEWLINE) 396 nonnewline(p); 397 else 398 EMIT(OANY, 0); 399 break; 400 case '[': 401 p_bracket(p); 402 break; 403 case '\\': 404 (void)REQUIRE(MORE(), REG_EESCAPE); 405 c = GETNEXT(); 406 ordinary(p, c); 407 break; 408 case '{': /* okay as ordinary except if digit follows */ 409 (void)REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 410 /* FALLTHROUGH */ 411 default: 412 ordinary(p, c); 413 break; 414 } 415 416 if (!MORE()) 417 return; 418 c = PEEK(); 419 /* we call { a repetition if followed by a digit */ 420 if (!( c == '*' || c == '+' || c == '?' || 421 (c == '{' && MORE2() && isdigit(PEEK2())) )) 422 return; /* no repetition, we're done */ 423 NEXT(); 424 425 (void)REQUIRE(!wascaret, REG_BADRPT); 426 switch (c) { 427 case '*': /* implemented as +? */ 428 /* this case does not require the (y|) trick, noKLUDGE */ 429 INSERT(OPLUS_, pos); 430 ASTERN(O_PLUS, pos); 431 INSERT(OQUEST_, pos); 432 ASTERN(O_QUEST, pos); 433 break; 434 case '+': 435 INSERT(OPLUS_, pos); 436 ASTERN(O_PLUS, pos); 437 break; 438 case '?': 439 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 440 INSERT(OCH_, pos); /* offset slightly wrong */ 441 ASTERN(OOR1, pos); /* this one's right */ 442 AHEAD(pos); /* fix the OCH_ */ 443 EMIT(OOR2, 0); /* offset very wrong... */ 444 AHEAD(THERE()); /* ...so fix it */ 445 ASTERN(O_CH, THERETHERE()); 446 break; 447 case '{': 448 count = p_count(p); 449 if (EAT(',')) { 450 if (isdigit(PEEK())) { 451 count2 = p_count(p); 452 (void)REQUIRE(count <= count2, REG_BADBR); 453 } else /* single number with comma */ 454 count2 = INFINITY; 455 } else /* just a single number */ 456 count2 = count; 457 repeat(p, pos, count, count2); 458 if (!EAT('}')) { /* error heuristics */ 459 while (MORE() && PEEK() != '}') 460 NEXT(); 461 (void)REQUIRE(MORE(), REG_EBRACE); 462 SETERROR(REG_BADBR); 463 } 464 break; 465 } 466 467 if (!MORE()) 468 return; 469 c = PEEK(); 470 if (!( c == '*' || c == '+' || c == '?' || 471 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 472 return; 473 SETERROR(REG_BADRPT); 474} 475 476/* 477 - p_str - string (no metacharacters) "parser" 478 == static void p_str(register struct parse *p); 479 */ 480static void 481p_str(p) 482register struct parse *p; 483{ 484 (void)REQUIRE(MORE(), REG_EMPTY); 485 while (MORE()) 486 ordinary(p, GETNEXT()); 487} 488 489/* 490 - p_bre - BRE parser top level, anchoring and concatenation 491 == static void p_bre(register struct parse *p, register int end1, \ 492 == register int end2); 493 * Giving end1 as OUT essentially eliminates the end1/end2 check. 494 * 495 * This implementation is a bit of a kludge, in that a trailing $ is first 496 * taken as an ordinary character and then revised to be an anchor. The 497 * only undesirable side effect is that '$' gets included as a character 498 * category in such cases. This is fairly harmless; not worth fixing. 499 * The amount of lookahead needed to avoid this kludge is excessive. 500 */ 501static void 502p_bre(p, end1, end2) 503register struct parse *p; 504register int end1; /* first terminating character */ 505register int end2; /* second terminating character */ 506{ 507 register sopno start = HERE(); 508 register int first = 1; /* first subexpression? */ 509 register int wasdollar = 0; 510 511 if (EAT('^')) { 512 EMIT(OBOL, 0); 513 p->g->iflags |= USEBOL; 514 p->g->nbol++; 515 } 516 while (MORE() && !SEETWO(end1, end2)) { 517 wasdollar = p_simp_re(p, first); 518 first = 0; 519 } 520 if (wasdollar) { /* oops, that was a trailing anchor */ 521 DROP(1); 522 EMIT(OEOL, 0); 523 p->g->iflags |= USEEOL; 524 p->g->neol++; 525 } 526 527 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 528} 529 530/* 531 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 532 == static int p_simp_re(register struct parse *p, int starordinary); 533 */ 534static int /* was the simple RE an unbackslashed $? */ 535p_simp_re(p, starordinary) 536register struct parse *p; 537int starordinary; /* is a leading * an ordinary character? */ 538{ 539 register int c; 540 register int count; 541 register int count2; 542 register sopno pos; 543 register int i; 544 register sopno subno; 545# define BACKSL (1<<CHAR_BIT) 546 547 pos = HERE(); /* repetion op, if any, covers from here */ 548 549 assert(MORE()); /* caller should have ensured this */ 550 c = GETNEXT(); 551 if (c == '\\') { 552 (void)REQUIRE(MORE(), REG_EESCAPE); 553 c = BACKSL | (unsigned char)GETNEXT(); 554 } 555 switch (c) { 556 case '.': 557 if (p->g->cflags®_NEWLINE) 558 nonnewline(p); 559 else 560 EMIT(OANY, 0); 561 break; 562 case '[': 563 p_bracket(p); 564 break; 565 case BACKSL|'{': 566 SETERROR(REG_BADRPT); 567 break; 568 case BACKSL|'(': 569 p->g->nsub++; 570 subno = p->g->nsub; 571 if (subno < NPAREN) 572 p->pbegin[subno] = HERE(); 573 EMIT(OLPAREN, subno); 574 /* the MORE here is an error heuristic */ 575 if (MORE() && !SEETWO('\\', ')')) 576 p_bre(p, '\\', ')'); 577 if (subno < NPAREN) { 578 p->pend[subno] = HERE(); 579 assert(p->pend[subno] != 0); 580 } 581 EMIT(ORPAREN, subno); 582 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 583 break; 584 case BACKSL|')': /* should not get here -- must be user */ 585 case BACKSL|'}': 586 SETERROR(REG_EPAREN); 587 break; 588 case BACKSL|'1': 589 case BACKSL|'2': 590 case BACKSL|'3': 591 case BACKSL|'4': 592 case BACKSL|'5': 593 case BACKSL|'6': 594 case BACKSL|'7': 595 case BACKSL|'8': 596 case BACKSL|'9': 597 i = (c&~BACKSL) - '0'; 598 assert(i < NPAREN); 599 if (p->pend[i] != 0) { 600 assert(i <= p->g->nsub); 601 EMIT(OBACK_, i); 602 assert(p->pbegin[i] != 0); 603 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 604 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 605 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 606 EMIT(O_BACK, i); 607 } else 608 SETERROR(REG_ESUBREG); 609 p->g->backrefs = 1; 610 break; 611 case '*': 612 (void)REQUIRE(starordinary, REG_BADRPT); 613 /* FALLTHROUGH */ 614 default: 615 ordinary(p, c &~ BACKSL); 616 break; 617 } 618 619 if (EAT('*')) { /* implemented as +? */ 620 /* this case does not require the (y|) trick, noKLUDGE */ 621 INSERT(OPLUS_, pos); 622 ASTERN(O_PLUS, pos); 623 INSERT(OQUEST_, pos); 624 ASTERN(O_QUEST, pos); 625 } else if (EATTWO('\\', '{')) { 626 count = p_count(p); 627 if (EAT(',')) { 628 if (MORE() && isdigit(PEEK())) { 629 count2 = p_count(p); 630 (void)REQUIRE(count <= count2, REG_BADBR); 631 } else /* single number with comma */ 632 count2 = INFINITY; 633 } else /* just a single number */ 634 count2 = count; 635 repeat(p, pos, count, count2); 636 if (!EATTWO('\\', '}')) { /* error heuristics */ 637 while (MORE() && !SEETWO('\\', '}')) 638 NEXT(); 639 (void)REQUIRE(MORE(), REG_EBRACE); 640 SETERROR(REG_BADBR); 641 } 642 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 643 return(1); 644 645 return(0); 646} 647 648/* 649 - p_count - parse a repetition count 650 == static int p_count(register struct parse *p); 651 */ 652static int /* the value */ 653p_count(p) 654register struct parse *p; 655{ 656 register int count = 0; 657 register int ndigits = 0; 658 659 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 660 count = count*10 + (GETNEXT() - '0'); 661 ndigits++; 662 } 663 664 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 665 return(count); 666} 667 668/* 669 - p_bracket - parse a bracketed character list 670 == static void p_bracket(register struct parse *p); 671 * 672 * Note a significant property of this code: if the allocset() did SETERROR, 673 * no set operations are done. 674 */ 675static void 676p_bracket(p) 677register struct parse *p; 678{ 679 register cset *cs = allocset(p); 680 register int invert = 0; 681 682 /* Dept of Truly Sickening Special-Case Kludges */ 683 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 684 EMIT(OBOW, 0); 685 NEXTn(6); 686 return; 687 } 688 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 689 EMIT(OEOW, 0); 690 NEXTn(6); 691 return; 692 } 693 694 if (EAT('^')) 695 invert++; /* make note to invert set at end */ 696 if (EAT(']')) 697 CHadd(cs, ']'); 698 else if (EAT('-')) 699 CHadd(cs, '-'); 700 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 701 p_b_term(p, cs); 702 if (EAT('-')) 703 CHadd(cs, '-'); 704 (void)MUSTEAT(']', REG_EBRACK); 705 706 if (p->error != 0) /* don't mess things up further */ 707 return; 708 709 if (p->g->cflags®_ICASE) { 710 register int i; 711 register int ci; 712 713 for (i = p->g->csetsize - 1; i >= 0; i--) 714 if (CHIN(cs, i) && isalpha(i)) { 715 ci = othercase(i); 716 if (ci != i) 717 CHadd(cs, ci); 718 } 719 if (cs->multis != NULL) 720 mccase(p, cs); 721 } 722 if (invert) { 723 register int i; 724 725 for (i = p->g->csetsize - 1; i >= 0; i--) 726 if (CHIN(cs, i)) 727 CHsub(cs, i); 728 else 729 CHadd(cs, i); 730 if (p->g->cflags®_NEWLINE) 731 CHsub(cs, '\n'); 732 if (cs->multis != NULL) 733 mcinvert(p, cs); 734 } 735 736 assert(cs->multis == NULL); /* xxx */ 737 738 if (nch(p, cs) == 1) { /* optimize singleton sets */ 739 ordinary(p, firstch(p, cs)); 740 freeset(p, cs); 741 } else 742 EMIT(OANYOF, freezeset(p, cs)); 743} 744 745/* 746 - p_b_term - parse one term of a bracketed character list 747 == static void p_b_term(register struct parse *p, register cset *cs); 748 */ 749static void 750p_b_term(p, cs) 751register struct parse *p; 752register cset *cs; 753{ 754 register char c; 755 register char start, finish; 756 register int i; 757 758 /* classify what we've got */ 759 switch ((MORE()) ? PEEK() : '\0') { 760 case '[': 761 c = (MORE2()) ? PEEK2() : '\0'; 762 break; 763 case '-': 764 SETERROR(REG_ERANGE); 765 return; /* NOTE RETURN */ 766 break; 767 default: 768 c = '\0'; 769 break; 770 } 771 772 switch (c) { 773 case ':': /* character class */ 774 NEXT2(); 775 (void)REQUIRE(MORE(), REG_EBRACK); 776 c = PEEK(); 777 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 778 p_b_cclass(p, cs); 779 (void)REQUIRE(MORE(), REG_EBRACK); 780 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 781 break; 782 case '=': /* equivalence class */ 783 NEXT2(); 784 (void)REQUIRE(MORE(), REG_EBRACK); 785 c = PEEK(); 786 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 787 p_b_eclass(p, cs); 788 (void)REQUIRE(MORE(), REG_EBRACK); 789 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 790 break; 791 default: /* symbol, ordinary character, or range */ 792/* xxx revision needed for multichar stuff */ 793 start = p_b_symbol(p); 794 if (SEE('-') && MORE2() && PEEK2() != ']') { 795 /* range */ 796 NEXT(); 797 if (EAT('-')) 798 finish = '-'; 799 else 800 finish = p_b_symbol(p); 801 } else 802 finish = start; 803 if (start == finish) 804 CHadd(cs, start); 805 else {
| 54#include "utils.h" 55#include "regex2.h" 56 57#include "cclass.h" 58#include "cname.h" 59 60/* 61 * parse structure, passed up and down to avoid global variables and 62 * other clumsinesses 63 */ 64struct parse { 65 char *next; /* next character in RE */ 66 char *end; /* end of string (-> NUL normally) */ 67 int error; /* has an error been seen? */ 68 sop *strip; /* malloced strip */ 69 sopno ssize; /* malloced strip size (allocated) */ 70 sopno slen; /* malloced strip length (used) */ 71 int ncsalloc; /* number of csets allocated */ 72 struct re_guts *g; 73# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 74 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 75 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 76}; 77 78/* ========= begin header generated by ./mkh ========= */ 79#ifdef __cplusplus 80extern "C" { 81#endif 82 83/* === regcomp.c === */ 84static void p_ere __P((struct parse *p, int stop)); 85static void p_ere_exp __P((struct parse *p)); 86static void p_str __P((struct parse *p)); 87static void p_bre __P((struct parse *p, int end1, int end2)); 88static int p_simp_re __P((struct parse *p, int starordinary)); 89static int p_count __P((struct parse *p)); 90static void p_bracket __P((struct parse *p)); 91static void p_b_term __P((struct parse *p, cset *cs)); 92static void p_b_cclass __P((struct parse *p, cset *cs)); 93static void p_b_eclass __P((struct parse *p, cset *cs)); 94static char p_b_symbol __P((struct parse *p)); 95static char p_b_coll_elem __P((struct parse *p, int endc)); 96static char othercase __P((int ch)); 97static void bothcases __P((struct parse *p, int ch)); 98static void ordinary __P((struct parse *p, int ch)); 99static void nonnewline __P((struct parse *p)); 100static void repeat __P((struct parse *p, sopno start, int from, int to)); 101static int seterr __P((struct parse *p, int e)); 102static cset *allocset __P((struct parse *p)); 103static void freeset __P((struct parse *p, cset *cs)); 104static int freezeset __P((struct parse *p, cset *cs)); 105static int firstch __P((struct parse *p, cset *cs)); 106static int nch __P((struct parse *p, cset *cs)); 107static void mcadd __P((struct parse *p, cset *cs, char *cp)); 108#if used 109static void mcsub __P((cset *cs, char *cp)); 110static int mcin __P((cset *cs, char *cp)); 111static char *mcfind __P((cset *cs, char *cp)); 112#endif 113static void mcinvert __P((struct parse *p, cset *cs)); 114static void mccase __P((struct parse *p, cset *cs)); 115static int isinsets __P((struct re_guts *g, int c)); 116static int samesets __P((struct re_guts *g, int c1, int c2)); 117static void categorize __P((struct parse *p, struct re_guts *g)); 118static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 119static void doemit __P((struct parse *p, sop op, size_t opnd)); 120static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 121static void dofwd __P((struct parse *p, sopno pos, sop value)); 122static void enlarge __P((struct parse *p, sopno size)); 123static void stripsnug __P((struct parse *p, struct re_guts *g)); 124static void findmust __P((struct parse *p, struct re_guts *g)); 125static sopno pluscount __P((struct parse *p, struct re_guts *g)); 126 127#ifdef __cplusplus 128} 129#endif 130/* ========= end header generated by ./mkh ========= */ 131 132static char nuls[10]; /* place to point scanner in event of error */ 133 134/* 135 * macros for use with parse structure 136 * BEWARE: these know that the parse structure is named `p' !!! 137 */ 138#define PEEK() (*p->next) 139#define PEEK2() (*(p->next+1)) 140#define MORE() (p->next < p->end) 141#define MORE2() (p->next+1 < p->end) 142#define SEE(c) (MORE() && PEEK() == (c)) 143#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 144#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 145#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 146#define NEXT() (p->next++) 147#define NEXT2() (p->next += 2) 148#define NEXTn(n) (p->next += (n)) 149#define GETNEXT() (*p->next++) 150#define SETERROR(e) seterr(p, (e)) 151#define REQUIRE(co, e) ((co) || SETERROR(e)) 152#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 153#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 154#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 155#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 156#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 157#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 158#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 159#define HERE() (p->slen) 160#define THERE() (p->slen - 1) 161#define THERETHERE() (p->slen - 2) 162#define DROP(n) (p->slen -= (n)) 163 164#ifndef NDEBUG 165static int never = 0; /* for use in asserts; shuts lint up */ 166#else 167#define never 0 /* some <assert.h>s have bugs too */ 168#endif 169 170/* 171 - regcomp - interface for parser and compilation 172 = extern int regcomp(regex_t *, const char *, int); 173 = #define REG_BASIC 0000 174 = #define REG_EXTENDED 0001 175 = #define REG_ICASE 0002 176 = #define REG_NOSUB 0004 177 = #define REG_NEWLINE 0010 178 = #define REG_NOSPEC 0020 179 = #define REG_PEND 0040 180 = #define REG_DUMP 0200 181 */ 182int /* 0 success, otherwise REG_something */ 183regcomp(preg, pattern, cflags) 184regex_t *preg; 185const char *pattern; 186int cflags; 187{ 188 struct parse pa; 189 register struct re_guts *g; 190 register struct parse *p = &pa; 191 register int i; 192 register size_t len; 193#ifdef REDEBUG 194# define GOODFLAGS(f) (f) 195#else 196# define GOODFLAGS(f) ((f)&~REG_DUMP) 197#endif 198 199 cflags = GOODFLAGS(cflags); 200 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 201 return(REG_INVARG); 202 203 if (cflags®_PEND) { 204 if (preg->re_endp < pattern) 205 return(REG_INVARG); 206 len = preg->re_endp - pattern; 207 } else 208 len = strlen((char *)pattern); 209 210 /* do the mallocs early so failure handling is easy */ 211 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 212 (NC-1)*sizeof(cat_t)); 213 if (g == NULL) 214 return(REG_ESPACE); 215 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 216 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 217 p->slen = 0; 218 if (p->strip == NULL) { 219 free((char *)g); 220 return(REG_ESPACE); 221 } 222 223 /* set things up */ 224 p->g = g; 225 p->next = (char *)pattern; /* convenience; we do not modify it */ 226 p->end = p->next + len; 227 p->error = 0; 228 p->ncsalloc = 0; 229 for (i = 0; i < NPAREN; i++) { 230 p->pbegin[i] = 0; 231 p->pend[i] = 0; 232 } 233 g->csetsize = NC; 234 g->sets = NULL; 235 g->setbits = NULL; 236 g->ncsets = 0; 237 g->cflags = cflags; 238 g->iflags = 0; 239 g->nbol = 0; 240 g->neol = 0; 241 g->must = NULL; 242 g->mlen = 0; 243 g->nsub = 0; 244 g->ncategories = 1; /* category 0 is "everything else" */ 245 g->categories = &g->catspace[-(CHAR_MIN)]; 246 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 247 g->backrefs = 0; 248 249 /* do it */ 250 EMIT(OEND, 0); 251 g->firststate = THERE(); 252 if (cflags®_EXTENDED) 253 p_ere(p, OUT); 254 else if (cflags®_NOSPEC) 255 p_str(p); 256 else 257 p_bre(p, OUT, OUT); 258 EMIT(OEND, 0); 259 g->laststate = THERE(); 260 261 /* tidy up loose ends and fill things in */ 262 categorize(p, g); 263 stripsnug(p, g); 264 findmust(p, g); 265 g->nplus = pluscount(p, g); 266 g->magic = MAGIC2; 267 preg->re_nsub = g->nsub; 268 preg->re_g = g; 269 preg->re_magic = MAGIC1; 270#ifndef REDEBUG 271 /* not debugging, so can't rely on the assert() in regexec() */ 272 if (g->iflags&BAD) 273 SETERROR(REG_ASSERT); 274#endif 275 276 /* win or lose, we're done */ 277 if (p->error != 0) /* lose */ 278 regfree(preg); 279 return(p->error); 280} 281 282/* 283 - p_ere - ERE parser top level, concatenation and alternation 284 == static void p_ere(register struct parse *p, int stop); 285 */ 286static void 287p_ere(p, stop) 288register struct parse *p; 289int stop; /* character this ERE should end at */ 290{ 291 register char c; 292 register sopno prevback; 293 register sopno prevfwd; 294 register sopno conc; 295 register int first = 1; /* is this the first alternative? */ 296 297 for (;;) { 298 /* do a bunch of concatenated expressions */ 299 conc = HERE(); 300 while (MORE() && (c = PEEK()) != '|' && c != stop) 301 p_ere_exp(p); 302 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 303 304 if (!EAT('|')) 305 break; /* NOTE BREAK OUT */ 306 307 if (first) { 308 INSERT(OCH_, conc); /* offset is wrong */ 309 prevfwd = conc; 310 prevback = conc; 311 first = 0; 312 } 313 ASTERN(OOR1, prevback); 314 prevback = THERE(); 315 AHEAD(prevfwd); /* fix previous offset */ 316 prevfwd = HERE(); 317 EMIT(OOR2, 0); /* offset is very wrong */ 318 } 319 320 if (!first) { /* tail-end fixups */ 321 AHEAD(prevfwd); 322 ASTERN(O_CH, prevback); 323 } 324 325 assert(!MORE() || SEE(stop)); 326} 327 328/* 329 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 330 == static void p_ere_exp(register struct parse *p); 331 */ 332static void 333p_ere_exp(p) 334register struct parse *p; 335{ 336 register char c; 337 register sopno pos; 338 register int count; 339 register int count2; 340 register sopno subno; 341 int wascaret = 0; 342 343 assert(MORE()); /* caller should have ensured this */ 344 c = GETNEXT(); 345 346 pos = HERE(); 347 switch (c) { 348 case '(': 349 (void)REQUIRE(MORE(), REG_EPAREN); 350 p->g->nsub++; 351 subno = p->g->nsub; 352 if (subno < NPAREN) 353 p->pbegin[subno] = HERE(); 354 EMIT(OLPAREN, subno); 355 if (!SEE(')')) 356 p_ere(p, ')'); 357 if (subno < NPAREN) { 358 p->pend[subno] = HERE(); 359 assert(p->pend[subno] != 0); 360 } 361 EMIT(ORPAREN, subno); 362 (void)MUSTEAT(')', REG_EPAREN); 363 break; 364#ifndef POSIX_MISTAKE 365 case ')': /* happens only if no current unmatched ( */ 366 /* 367 * You may ask, why the ifndef? Because I didn't notice 368 * this until slightly too late for 1003.2, and none of the 369 * other 1003.2 regular-expression reviewers noticed it at 370 * all. So an unmatched ) is legal POSIX, at least until 371 * we can get it fixed. 372 */ 373 SETERROR(REG_EPAREN); 374 break; 375#endif 376 case '^': 377 EMIT(OBOL, 0); 378 p->g->iflags |= USEBOL; 379 p->g->nbol++; 380 wascaret = 1; 381 break; 382 case '$': 383 EMIT(OEOL, 0); 384 p->g->iflags |= USEEOL; 385 p->g->neol++; 386 break; 387 case '|': 388 SETERROR(REG_EMPTY); 389 break; 390 case '*': 391 case '+': 392 case '?': 393 SETERROR(REG_BADRPT); 394 break; 395 case '.': 396 if (p->g->cflags®_NEWLINE) 397 nonnewline(p); 398 else 399 EMIT(OANY, 0); 400 break; 401 case '[': 402 p_bracket(p); 403 break; 404 case '\\': 405 (void)REQUIRE(MORE(), REG_EESCAPE); 406 c = GETNEXT(); 407 ordinary(p, c); 408 break; 409 case '{': /* okay as ordinary except if digit follows */ 410 (void)REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 411 /* FALLTHROUGH */ 412 default: 413 ordinary(p, c); 414 break; 415 } 416 417 if (!MORE()) 418 return; 419 c = PEEK(); 420 /* we call { a repetition if followed by a digit */ 421 if (!( c == '*' || c == '+' || c == '?' || 422 (c == '{' && MORE2() && isdigit(PEEK2())) )) 423 return; /* no repetition, we're done */ 424 NEXT(); 425 426 (void)REQUIRE(!wascaret, REG_BADRPT); 427 switch (c) { 428 case '*': /* implemented as +? */ 429 /* this case does not require the (y|) trick, noKLUDGE */ 430 INSERT(OPLUS_, pos); 431 ASTERN(O_PLUS, pos); 432 INSERT(OQUEST_, pos); 433 ASTERN(O_QUEST, pos); 434 break; 435 case '+': 436 INSERT(OPLUS_, pos); 437 ASTERN(O_PLUS, pos); 438 break; 439 case '?': 440 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 441 INSERT(OCH_, pos); /* offset slightly wrong */ 442 ASTERN(OOR1, pos); /* this one's right */ 443 AHEAD(pos); /* fix the OCH_ */ 444 EMIT(OOR2, 0); /* offset very wrong... */ 445 AHEAD(THERE()); /* ...so fix it */ 446 ASTERN(O_CH, THERETHERE()); 447 break; 448 case '{': 449 count = p_count(p); 450 if (EAT(',')) { 451 if (isdigit(PEEK())) { 452 count2 = p_count(p); 453 (void)REQUIRE(count <= count2, REG_BADBR); 454 } else /* single number with comma */ 455 count2 = INFINITY; 456 } else /* just a single number */ 457 count2 = count; 458 repeat(p, pos, count, count2); 459 if (!EAT('}')) { /* error heuristics */ 460 while (MORE() && PEEK() != '}') 461 NEXT(); 462 (void)REQUIRE(MORE(), REG_EBRACE); 463 SETERROR(REG_BADBR); 464 } 465 break; 466 } 467 468 if (!MORE()) 469 return; 470 c = PEEK(); 471 if (!( c == '*' || c == '+' || c == '?' || 472 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 473 return; 474 SETERROR(REG_BADRPT); 475} 476 477/* 478 - p_str - string (no metacharacters) "parser" 479 == static void p_str(register struct parse *p); 480 */ 481static void 482p_str(p) 483register struct parse *p; 484{ 485 (void)REQUIRE(MORE(), REG_EMPTY); 486 while (MORE()) 487 ordinary(p, GETNEXT()); 488} 489 490/* 491 - p_bre - BRE parser top level, anchoring and concatenation 492 == static void p_bre(register struct parse *p, register int end1, \ 493 == register int end2); 494 * Giving end1 as OUT essentially eliminates the end1/end2 check. 495 * 496 * This implementation is a bit of a kludge, in that a trailing $ is first 497 * taken as an ordinary character and then revised to be an anchor. The 498 * only undesirable side effect is that '$' gets included as a character 499 * category in such cases. This is fairly harmless; not worth fixing. 500 * The amount of lookahead needed to avoid this kludge is excessive. 501 */ 502static void 503p_bre(p, end1, end2) 504register struct parse *p; 505register int end1; /* first terminating character */ 506register int end2; /* second terminating character */ 507{ 508 register sopno start = HERE(); 509 register int first = 1; /* first subexpression? */ 510 register int wasdollar = 0; 511 512 if (EAT('^')) { 513 EMIT(OBOL, 0); 514 p->g->iflags |= USEBOL; 515 p->g->nbol++; 516 } 517 while (MORE() && !SEETWO(end1, end2)) { 518 wasdollar = p_simp_re(p, first); 519 first = 0; 520 } 521 if (wasdollar) { /* oops, that was a trailing anchor */ 522 DROP(1); 523 EMIT(OEOL, 0); 524 p->g->iflags |= USEEOL; 525 p->g->neol++; 526 } 527 528 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 529} 530 531/* 532 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 533 == static int p_simp_re(register struct parse *p, int starordinary); 534 */ 535static int /* was the simple RE an unbackslashed $? */ 536p_simp_re(p, starordinary) 537register struct parse *p; 538int starordinary; /* is a leading * an ordinary character? */ 539{ 540 register int c; 541 register int count; 542 register int count2; 543 register sopno pos; 544 register int i; 545 register sopno subno; 546# define BACKSL (1<<CHAR_BIT) 547 548 pos = HERE(); /* repetion op, if any, covers from here */ 549 550 assert(MORE()); /* caller should have ensured this */ 551 c = GETNEXT(); 552 if (c == '\\') { 553 (void)REQUIRE(MORE(), REG_EESCAPE); 554 c = BACKSL | (unsigned char)GETNEXT(); 555 } 556 switch (c) { 557 case '.': 558 if (p->g->cflags®_NEWLINE) 559 nonnewline(p); 560 else 561 EMIT(OANY, 0); 562 break; 563 case '[': 564 p_bracket(p); 565 break; 566 case BACKSL|'{': 567 SETERROR(REG_BADRPT); 568 break; 569 case BACKSL|'(': 570 p->g->nsub++; 571 subno = p->g->nsub; 572 if (subno < NPAREN) 573 p->pbegin[subno] = HERE(); 574 EMIT(OLPAREN, subno); 575 /* the MORE here is an error heuristic */ 576 if (MORE() && !SEETWO('\\', ')')) 577 p_bre(p, '\\', ')'); 578 if (subno < NPAREN) { 579 p->pend[subno] = HERE(); 580 assert(p->pend[subno] != 0); 581 } 582 EMIT(ORPAREN, subno); 583 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 584 break; 585 case BACKSL|')': /* should not get here -- must be user */ 586 case BACKSL|'}': 587 SETERROR(REG_EPAREN); 588 break; 589 case BACKSL|'1': 590 case BACKSL|'2': 591 case BACKSL|'3': 592 case BACKSL|'4': 593 case BACKSL|'5': 594 case BACKSL|'6': 595 case BACKSL|'7': 596 case BACKSL|'8': 597 case BACKSL|'9': 598 i = (c&~BACKSL) - '0'; 599 assert(i < NPAREN); 600 if (p->pend[i] != 0) { 601 assert(i <= p->g->nsub); 602 EMIT(OBACK_, i); 603 assert(p->pbegin[i] != 0); 604 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 605 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 606 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 607 EMIT(O_BACK, i); 608 } else 609 SETERROR(REG_ESUBREG); 610 p->g->backrefs = 1; 611 break; 612 case '*': 613 (void)REQUIRE(starordinary, REG_BADRPT); 614 /* FALLTHROUGH */ 615 default: 616 ordinary(p, c &~ BACKSL); 617 break; 618 } 619 620 if (EAT('*')) { /* implemented as +? */ 621 /* this case does not require the (y|) trick, noKLUDGE */ 622 INSERT(OPLUS_, pos); 623 ASTERN(O_PLUS, pos); 624 INSERT(OQUEST_, pos); 625 ASTERN(O_QUEST, pos); 626 } else if (EATTWO('\\', '{')) { 627 count = p_count(p); 628 if (EAT(',')) { 629 if (MORE() && isdigit(PEEK())) { 630 count2 = p_count(p); 631 (void)REQUIRE(count <= count2, REG_BADBR); 632 } else /* single number with comma */ 633 count2 = INFINITY; 634 } else /* just a single number */ 635 count2 = count; 636 repeat(p, pos, count, count2); 637 if (!EATTWO('\\', '}')) { /* error heuristics */ 638 while (MORE() && !SEETWO('\\', '}')) 639 NEXT(); 640 (void)REQUIRE(MORE(), REG_EBRACE); 641 SETERROR(REG_BADBR); 642 } 643 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 644 return(1); 645 646 return(0); 647} 648 649/* 650 - p_count - parse a repetition count 651 == static int p_count(register struct parse *p); 652 */ 653static int /* the value */ 654p_count(p) 655register struct parse *p; 656{ 657 register int count = 0; 658 register int ndigits = 0; 659 660 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 661 count = count*10 + (GETNEXT() - '0'); 662 ndigits++; 663 } 664 665 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 666 return(count); 667} 668 669/* 670 - p_bracket - parse a bracketed character list 671 == static void p_bracket(register struct parse *p); 672 * 673 * Note a significant property of this code: if the allocset() did SETERROR, 674 * no set operations are done. 675 */ 676static void 677p_bracket(p) 678register struct parse *p; 679{ 680 register cset *cs = allocset(p); 681 register int invert = 0; 682 683 /* Dept of Truly Sickening Special-Case Kludges */ 684 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 685 EMIT(OBOW, 0); 686 NEXTn(6); 687 return; 688 } 689 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 690 EMIT(OEOW, 0); 691 NEXTn(6); 692 return; 693 } 694 695 if (EAT('^')) 696 invert++; /* make note to invert set at end */ 697 if (EAT(']')) 698 CHadd(cs, ']'); 699 else if (EAT('-')) 700 CHadd(cs, '-'); 701 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 702 p_b_term(p, cs); 703 if (EAT('-')) 704 CHadd(cs, '-'); 705 (void)MUSTEAT(']', REG_EBRACK); 706 707 if (p->error != 0) /* don't mess things up further */ 708 return; 709 710 if (p->g->cflags®_ICASE) { 711 register int i; 712 register int ci; 713 714 for (i = p->g->csetsize - 1; i >= 0; i--) 715 if (CHIN(cs, i) && isalpha(i)) { 716 ci = othercase(i); 717 if (ci != i) 718 CHadd(cs, ci); 719 } 720 if (cs->multis != NULL) 721 mccase(p, cs); 722 } 723 if (invert) { 724 register int i; 725 726 for (i = p->g->csetsize - 1; i >= 0; i--) 727 if (CHIN(cs, i)) 728 CHsub(cs, i); 729 else 730 CHadd(cs, i); 731 if (p->g->cflags®_NEWLINE) 732 CHsub(cs, '\n'); 733 if (cs->multis != NULL) 734 mcinvert(p, cs); 735 } 736 737 assert(cs->multis == NULL); /* xxx */ 738 739 if (nch(p, cs) == 1) { /* optimize singleton sets */ 740 ordinary(p, firstch(p, cs)); 741 freeset(p, cs); 742 } else 743 EMIT(OANYOF, freezeset(p, cs)); 744} 745 746/* 747 - p_b_term - parse one term of a bracketed character list 748 == static void p_b_term(register struct parse *p, register cset *cs); 749 */ 750static void 751p_b_term(p, cs) 752register struct parse *p; 753register cset *cs; 754{ 755 register char c; 756 register char start, finish; 757 register int i; 758 759 /* classify what we've got */ 760 switch ((MORE()) ? PEEK() : '\0') { 761 case '[': 762 c = (MORE2()) ? PEEK2() : '\0'; 763 break; 764 case '-': 765 SETERROR(REG_ERANGE); 766 return; /* NOTE RETURN */ 767 break; 768 default: 769 c = '\0'; 770 break; 771 } 772 773 switch (c) { 774 case ':': /* character class */ 775 NEXT2(); 776 (void)REQUIRE(MORE(), REG_EBRACK); 777 c = PEEK(); 778 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 779 p_b_cclass(p, cs); 780 (void)REQUIRE(MORE(), REG_EBRACK); 781 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 782 break; 783 case '=': /* equivalence class */ 784 NEXT2(); 785 (void)REQUIRE(MORE(), REG_EBRACK); 786 c = PEEK(); 787 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 788 p_b_eclass(p, cs); 789 (void)REQUIRE(MORE(), REG_EBRACK); 790 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 791 break; 792 default: /* symbol, ordinary character, or range */ 793/* xxx revision needed for multichar stuff */ 794 start = p_b_symbol(p); 795 if (SEE('-') && MORE2() && PEEK2() != ']') { 796 /* range */ 797 NEXT(); 798 if (EAT('-')) 799 finish = '-'; 800 else 801 finish = p_b_symbol(p); 802 } else 803 finish = start; 804 if (start == finish) 805 CHadd(cs, start); 806 else {
|
806 (void)REQUIRE(collate_range_cmp(start, finish) <= 0, REG_ERANGE);
| 807 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
|
807 for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
| 808 for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
|
808 if ( collate_range_cmp(start, i) <= 0 809 && collate_range_cmp(i, finish) <= 0
| 809 if ( __collate_range_cmp(start, i) <= 0 810 && __collate_range_cmp(i, finish) <= 0
|
810 ) 811 CHadd(cs, i); 812 } 813 } 814 break; 815 } 816} 817 818/* 819 - p_b_cclass - parse a character-class name and deal with it 820 == static void p_b_cclass(register struct parse *p, register cset *cs); 821 */ 822static void 823p_b_cclass(p, cs) 824register struct parse *p; 825register cset *cs; 826{ 827 register int c; 828 register char *sp = p->next; 829 register struct cclass *cp; 830 register size_t len; 831 832 while (MORE() && isalpha((uch)PEEK())) 833 NEXT(); 834 len = p->next - sp; 835 for (cp = cclasses; cp->name != NULL; cp++) 836 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 837 break; 838 if (cp->name == NULL) { 839 /* oops, didn't find it */ 840 SETERROR(REG_ECTYPE); 841 return; 842 } 843 844 switch (cp->fidx) { 845 case CALNUM: 846 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 847 if (isalnum((uch)c)) 848 CHadd(cs, c); 849 break; 850 case CALPHA: 851 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 852 if (isalpha((uch)c)) 853 CHadd(cs, c); 854 break; 855 case CBLANK: 856 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 857 if (isblank((uch)c)) 858 CHadd(cs, c); 859 break; 860 case CCNTRL: 861 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 862 if (iscntrl((uch)c)) 863 CHadd(cs, c); 864 break; 865 case CDIGIT: 866 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 867 if (isdigit((uch)c)) 868 CHadd(cs, c); 869 break; 870 case CGRAPH: 871 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 872 if (isgraph((uch)c)) 873 CHadd(cs, c); 874 break; 875 case CLOWER: 876 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 877 if (islower((uch)c)) 878 CHadd(cs, c); 879 break; 880 case CPRINT: 881 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 882 if (isprint((uch)c)) 883 CHadd(cs, c); 884 break; 885 case CPUNCT: 886 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 887 if (ispunct((uch)c)) 888 CHadd(cs, c); 889 break; 890 case CSPACE: 891 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 892 if (isspace((uch)c)) 893 CHadd(cs, c); 894 break; 895 case CUPPER: 896 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 897 if (isupper((uch)c)) 898 CHadd(cs, c); 899 break; 900 case CXDIGIT: 901 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 902 if (isxdigit((uch)c)) 903 CHadd(cs, c); 904 break; 905 } 906#if 0 907 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 908 MCadd(p, cs, u); 909#endif 910} 911 912/* 913 - p_b_eclass - parse an equivalence-class name and deal with it 914 == static void p_b_eclass(register struct parse *p, register cset *cs); 915 * 916 * This implementation is incomplete. xxx 917 */ 918static void 919p_b_eclass(p, cs) 920register struct parse *p; 921register cset *cs; 922{ 923 register char c; 924 925 c = p_b_coll_elem(p, '='); 926 CHadd(cs, c); 927} 928 929/* 930 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 931 == static char p_b_symbol(register struct parse *p); 932 */ 933static char /* value of symbol */ 934p_b_symbol(p) 935register struct parse *p; 936{ 937 register char value; 938 939 (void)REQUIRE(MORE(), REG_EBRACK); 940 if (!EATTWO('[', '.')) 941 return(GETNEXT()); 942 943 /* collating symbol */ 944 value = p_b_coll_elem(p, '.'); 945 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 946 return(value); 947} 948 949/* 950 - p_b_coll_elem - parse a collating-element name and look it up 951 == static char p_b_coll_elem(register struct parse *p, int endc); 952 */ 953static char /* value of collating element */ 954p_b_coll_elem(p, endc) 955register struct parse *p; 956int endc; /* name ended by endc,']' */ 957{ 958 register char *sp = p->next; 959 register struct cname *cp; 960 register int len; 961 962 while (MORE() && !SEETWO(endc, ']')) 963 NEXT(); 964 if (!MORE()) { 965 SETERROR(REG_EBRACK); 966 return(0); 967 } 968 len = p->next - sp; 969 for (cp = cnames; cp->name != NULL; cp++) 970 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 971 return(cp->code); /* known name */ 972 if (len == 1) 973 return(*sp); /* single character */ 974 SETERROR(REG_ECOLLATE); /* neither */ 975 return(0); 976} 977 978/* 979 - othercase - return the case counterpart of an alphabetic 980 == static char othercase(int ch); 981 */ 982static char /* if no counterpart, return ch */ 983othercase(ch) 984int ch; 985{ 986 ch = (unsigned char)ch; 987 assert(isalpha(ch)); 988 if (isupper(ch)) 989 return(tolower(ch)); 990 else if (islower(ch)) 991 return(toupper(ch)); 992 else /* peculiar, but could happen */ 993 return(ch); 994} 995 996/* 997 - bothcases - emit a dualcase version of a two-case character 998 == static void bothcases(register struct parse *p, int ch); 999 * 1000 * Boy, is this implementation ever a kludge... 1001 */ 1002static void 1003bothcases(p, ch) 1004register struct parse *p; 1005int ch; 1006{ 1007 register char *oldnext = p->next; 1008 register char *oldend = p->end; 1009 char bracket[3]; 1010 1011 ch = (unsigned char)ch; 1012 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1013 p->next = bracket; 1014 p->end = bracket+2; 1015 bracket[0] = ch; 1016 bracket[1] = ']'; 1017 bracket[2] = '\0'; 1018 p_bracket(p); 1019 assert(p->next == bracket+2); 1020 p->next = oldnext; 1021 p->end = oldend; 1022} 1023 1024/* 1025 - ordinary - emit an ordinary character 1026 == static void ordinary(register struct parse *p, register int ch); 1027 */ 1028static void 1029ordinary(p, ch) 1030register struct parse *p; 1031register int ch; 1032{ 1033 register cat_t *cap = p->g->categories; 1034 1035 if ((p->g->cflags®_ICASE) && isalpha((unsigned char)ch) && othercase(ch) != ch) 1036 bothcases(p, ch); 1037 else { 1038 EMIT(OCHAR, (unsigned char)ch); 1039 if (cap[ch] == 0) 1040 cap[ch] = p->g->ncategories++; 1041 } 1042} 1043 1044/* 1045 - nonnewline - emit REG_NEWLINE version of OANY 1046 == static void nonnewline(register struct parse *p); 1047 * 1048 * Boy, is this implementation ever a kludge... 1049 */ 1050static void 1051nonnewline(p) 1052register struct parse *p; 1053{ 1054 register char *oldnext = p->next; 1055 register char *oldend = p->end; 1056 char bracket[4]; 1057 1058 p->next = bracket; 1059 p->end = bracket+3; 1060 bracket[0] = '^'; 1061 bracket[1] = '\n'; 1062 bracket[2] = ']'; 1063 bracket[3] = '\0'; 1064 p_bracket(p); 1065 assert(p->next == bracket+3); 1066 p->next = oldnext; 1067 p->end = oldend; 1068} 1069 1070/* 1071 - repeat - generate code for a bounded repetition, recursively if needed 1072 == static void repeat(register struct parse *p, sopno start, int from, int to); 1073 */ 1074static void 1075repeat(p, start, from, to) 1076register struct parse *p; 1077sopno start; /* operand from here to end of strip */ 1078int from; /* repeated from this number */ 1079int to; /* to this number of times (maybe INFINITY) */ 1080{ 1081 register sopno finish = HERE(); 1082# define N 2 1083# define INF 3 1084# define REP(f, t) ((f)*8 + (t)) 1085# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1086 register sopno copy; 1087 1088 if (p->error != 0) /* head off possible runaway recursion */ 1089 return; 1090 1091 assert(from <= to); 1092 1093 switch (REP(MAP(from), MAP(to))) { 1094 case REP(0, 0): /* must be user doing this */ 1095 DROP(finish-start); /* drop the operand */ 1096 break; 1097 case REP(0, 1): /* as x{1,1}? */ 1098 case REP(0, N): /* as x{1,n}? */ 1099 case REP(0, INF): /* as x{1,}? */ 1100 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1101 INSERT(OCH_, start); /* offset is wrong... */ 1102 repeat(p, start+1, 1, to); 1103 ASTERN(OOR1, start); 1104 AHEAD(start); /* ... fix it */ 1105 EMIT(OOR2, 0); 1106 AHEAD(THERE()); 1107 ASTERN(O_CH, THERETHERE()); 1108 break; 1109 case REP(1, 1): /* trivial case */ 1110 /* done */ 1111 break; 1112 case REP(1, N): /* as x?x{1,n-1} */ 1113 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1114 INSERT(OCH_, start); 1115 ASTERN(OOR1, start); 1116 AHEAD(start); 1117 EMIT(OOR2, 0); /* offset very wrong... */ 1118 AHEAD(THERE()); /* ...so fix it */ 1119 ASTERN(O_CH, THERETHERE()); 1120 copy = dupl(p, start+1, finish+1); 1121 assert(copy == finish+4); 1122 repeat(p, copy, 1, to-1); 1123 break; 1124 case REP(1, INF): /* as x+ */ 1125 INSERT(OPLUS_, start); 1126 ASTERN(O_PLUS, start); 1127 break; 1128 case REP(N, N): /* as xx{m-1,n-1} */ 1129 copy = dupl(p, start, finish); 1130 repeat(p, copy, from-1, to-1); 1131 break; 1132 case REP(N, INF): /* as xx{n-1,INF} */ 1133 copy = dupl(p, start, finish); 1134 repeat(p, copy, from-1, to); 1135 break; 1136 default: /* "can't happen" */ 1137 SETERROR(REG_ASSERT); /* just in case */ 1138 break; 1139 } 1140} 1141 1142/* 1143 - seterr - set an error condition 1144 == static int seterr(register struct parse *p, int e); 1145 */ 1146static int /* useless but makes type checking happy */ 1147seterr(p, e) 1148register struct parse *p; 1149int e; 1150{ 1151 if (p->error == 0) /* keep earliest error condition */ 1152 p->error = e; 1153 p->next = nuls; /* try to bring things to a halt */ 1154 p->end = nuls; 1155 return(0); /* make the return value well-defined */ 1156} 1157 1158/* 1159 - allocset - allocate a set of characters for [] 1160 == static cset *allocset(register struct parse *p); 1161 */ 1162static cset * 1163allocset(p) 1164register struct parse *p; 1165{ 1166 register int no = p->g->ncsets++; 1167 register size_t nc; 1168 register size_t nbytes; 1169 register cset *cs; 1170 register size_t css = (size_t)p->g->csetsize; 1171 register int i; 1172 1173 if (no >= p->ncsalloc) { /* need another column of space */ 1174 p->ncsalloc += CHAR_BIT; 1175 nc = p->ncsalloc; 1176 assert(nc % CHAR_BIT == 0); 1177 nbytes = nc / CHAR_BIT * css; 1178 if (p->g->sets == NULL) 1179 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1180 else 1181 p->g->sets = (cset *)realloc((char *)p->g->sets, 1182 nc * sizeof(cset)); 1183 if (p->g->setbits == NULL) 1184 p->g->setbits = (uch *)malloc(nbytes); 1185 else { 1186 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1187 nbytes); 1188 /* xxx this isn't right if setbits is now NULL */ 1189 for (i = 0; i < no; i++) 1190 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1191 } 1192 if (p->g->sets != NULL && p->g->setbits != NULL) 1193 (void) memset((char *)p->g->setbits + (nbytes - css), 1194 0, css); 1195 else { 1196 no = 0; 1197 SETERROR(REG_ESPACE); 1198 /* caller's responsibility not to do set ops */ 1199 } 1200 } 1201 1202 assert(p->g->sets != NULL); /* xxx */ 1203 cs = &p->g->sets[no]; 1204 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1205 cs->mask = 1 << ((no) % CHAR_BIT); 1206 cs->hash = 0; 1207 cs->smultis = 0; 1208 cs->multis = NULL; 1209 1210 return(cs); 1211} 1212 1213/* 1214 - freeset - free a now-unused set 1215 == static void freeset(register struct parse *p, register cset *cs); 1216 */ 1217static void 1218freeset(p, cs) 1219register struct parse *p; 1220register cset *cs; 1221{ 1222 register int i; 1223 register cset *top = &p->g->sets[p->g->ncsets]; 1224 register size_t css = (size_t)p->g->csetsize; 1225 1226 for (i = 0; i < css; i++) 1227 CHsub(cs, i); 1228 if (cs == top-1) /* recover only the easy case */ 1229 p->g->ncsets--; 1230} 1231 1232/* 1233 - freezeset - final processing on a set of characters 1234 == static int freezeset(register struct parse *p, register cset *cs); 1235 * 1236 * The main task here is merging identical sets. This is usually a waste 1237 * of time (although the hash code minimizes the overhead), but can win 1238 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1239 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1240 * the same value! 1241 */ 1242static int /* set number */ 1243freezeset(p, cs) 1244register struct parse *p; 1245register cset *cs; 1246{ 1247 register short h = cs->hash; 1248 register int i; 1249 register cset *top = &p->g->sets[p->g->ncsets]; 1250 register cset *cs2; 1251 register size_t css = (size_t)p->g->csetsize; 1252 1253 /* look for an earlier one which is the same */ 1254 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1255 if (cs2->hash == h && cs2 != cs) { 1256 /* maybe */ 1257 for (i = 0; i < css; i++) 1258 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1259 break; /* no */ 1260 if (i == css) 1261 break; /* yes */ 1262 } 1263 1264 if (cs2 < top) { /* found one */ 1265 freeset(p, cs); 1266 cs = cs2; 1267 } 1268 1269 return((int)(cs - p->g->sets)); 1270} 1271 1272/* 1273 - firstch - return first character in a set (which must have at least one) 1274 == static int firstch(register struct parse *p, register cset *cs); 1275 */ 1276static int /* character; there is no "none" value */ 1277firstch(p, cs) 1278register struct parse *p; 1279register cset *cs; 1280{ 1281 register int i; 1282 register size_t css = (size_t)p->g->csetsize; 1283 1284 for (i = 0; i < css; i++) 1285 if (CHIN(cs, i)) 1286 return((unsigned char)i); 1287 assert(never); 1288 return(0); /* arbitrary */ 1289} 1290 1291/* 1292 - nch - number of characters in a set 1293 == static int nch(register struct parse *p, register cset *cs); 1294 */ 1295static int 1296nch(p, cs) 1297register struct parse *p; 1298register cset *cs; 1299{ 1300 register int i; 1301 register size_t css = (size_t)p->g->csetsize; 1302 register int n = 0; 1303 1304 for (i = 0; i < css; i++) 1305 if (CHIN(cs, i)) 1306 n++; 1307 return(n); 1308} 1309 1310/* 1311 - mcadd - add a collating element to a cset 1312 == static void mcadd(register struct parse *p, register cset *cs, \ 1313 == register char *cp); 1314 */ 1315static void 1316mcadd(p, cs, cp) 1317register struct parse *p; 1318register cset *cs; 1319register char *cp; 1320{ 1321 register size_t oldend = cs->smultis; 1322 1323 cs->smultis += strlen(cp) + 1; 1324 if (cs->multis == NULL) 1325 cs->multis = malloc(cs->smultis); 1326 else 1327 cs->multis = realloc(cs->multis, cs->smultis); 1328 if (cs->multis == NULL) { 1329 SETERROR(REG_ESPACE); 1330 return; 1331 } 1332 1333 (void) strcpy(cs->multis + oldend - 1, cp); 1334 cs->multis[cs->smultis - 1] = '\0'; 1335} 1336 1337#if used 1338/* 1339 - mcsub - subtract a collating element from a cset 1340 == static void mcsub(register cset *cs, register char *cp); 1341 */ 1342static void 1343mcsub(cs, cp) 1344register cset *cs; 1345register char *cp; 1346{ 1347 register char *fp = mcfind(cs, cp); 1348 register size_t len = strlen(fp); 1349 1350 assert(fp != NULL); 1351 (void) memmove(fp, fp + len + 1, 1352 cs->smultis - (fp + len + 1 - cs->multis)); 1353 cs->smultis -= len; 1354 1355 if (cs->smultis == 0) { 1356 free(cs->multis); 1357 cs->multis = NULL; 1358 return; 1359 } 1360 1361 cs->multis = realloc(cs->multis, cs->smultis); 1362 assert(cs->multis != NULL); 1363} 1364 1365/* 1366 - mcin - is a collating element in a cset? 1367 == static int mcin(register cset *cs, register char *cp); 1368 */ 1369static int 1370mcin(cs, cp) 1371register cset *cs; 1372register char *cp; 1373{ 1374 return(mcfind(cs, cp) != NULL); 1375} 1376 1377/* 1378 - mcfind - find a collating element in a cset 1379 == static char *mcfind(register cset *cs, register char *cp); 1380 */ 1381static char * 1382mcfind(cs, cp) 1383register cset *cs; 1384register char *cp; 1385{ 1386 register char *p; 1387 1388 if (cs->multis == NULL) 1389 return(NULL); 1390 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1391 if (strcmp(cp, p) == 0) 1392 return(p); 1393 return(NULL); 1394} 1395#endif 1396 1397/* 1398 - mcinvert - invert the list of collating elements in a cset 1399 == static void mcinvert(register struct parse *p, register cset *cs); 1400 * 1401 * This would have to know the set of possibilities. Implementation 1402 * is deferred. 1403 */ 1404static void 1405mcinvert(p, cs) 1406register struct parse *p; 1407register cset *cs; 1408{ 1409 assert(cs->multis == NULL); /* xxx */ 1410} 1411 1412/* 1413 - mccase - add case counterparts of the list of collating elements in a cset 1414 == static void mccase(register struct parse *p, register cset *cs); 1415 * 1416 * This would have to know the set of possibilities. Implementation 1417 * is deferred. 1418 */ 1419static void 1420mccase(p, cs) 1421register struct parse *p; 1422register cset *cs; 1423{ 1424 assert(cs->multis == NULL); /* xxx */ 1425} 1426 1427/* 1428 - isinsets - is this character in any sets? 1429 == static int isinsets(register struct re_guts *g, int c); 1430 */ 1431static int /* predicate */ 1432isinsets(g, c) 1433register struct re_guts *g; 1434int c; 1435{ 1436 register uch *col; 1437 register int i; 1438 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1439 register unsigned uc = (unsigned char)c; 1440 1441 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1442 if (col[uc] != 0) 1443 return(1); 1444 return(0); 1445} 1446 1447/* 1448 - samesets - are these two characters in exactly the same sets? 1449 == static int samesets(register struct re_guts *g, int c1, int c2); 1450 */ 1451static int /* predicate */ 1452samesets(g, c1, c2) 1453register struct re_guts *g; 1454int c1; 1455int c2; 1456{ 1457 register uch *col; 1458 register int i; 1459 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1460 register unsigned uc1 = (unsigned char)c1; 1461 register unsigned uc2 = (unsigned char)c2; 1462 1463 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1464 if (col[uc1] != col[uc2]) 1465 return(0); 1466 return(1); 1467} 1468 1469/* 1470 - categorize - sort out character categories 1471 == static void categorize(struct parse *p, register struct re_guts *g); 1472 */ 1473static void 1474categorize(p, g) 1475struct parse *p; 1476register struct re_guts *g; 1477{ 1478 register cat_t *cats = g->categories; 1479 register int c; 1480 register int c2; 1481 register cat_t cat; 1482 1483 /* avoid making error situations worse */ 1484 if (p->error != 0) 1485 return; 1486 1487 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1488 if (cats[c] == 0 && isinsets(g, c)) { 1489 cat = g->ncategories++; 1490 cats[c] = cat; 1491 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1492 if (cats[c2] == 0 && samesets(g, c, c2)) 1493 cats[c2] = cat; 1494 } 1495} 1496 1497/* 1498 - dupl - emit a duplicate of a bunch of sops 1499 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1500 */ 1501static sopno /* start of duplicate */ 1502dupl(p, start, finish) 1503register struct parse *p; 1504sopno start; /* from here */ 1505sopno finish; /* to this less one */ 1506{ 1507 register sopno ret = HERE(); 1508 register sopno len = finish - start; 1509 1510 assert(finish >= start); 1511 if (len == 0) 1512 return(ret); 1513 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1514 assert(p->ssize >= p->slen + len); 1515 (void) memcpy((char *)(p->strip + p->slen), 1516 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1517 p->slen += len; 1518 return(ret); 1519} 1520 1521/* 1522 - doemit - emit a strip operator 1523 == static void doemit(register struct parse *p, sop op, size_t opnd); 1524 * 1525 * It might seem better to implement this as a macro with a function as 1526 * hard-case backup, but it's just too big and messy unless there are 1527 * some changes to the data structures. Maybe later. 1528 */ 1529static void 1530doemit(p, op, opnd) 1531register struct parse *p; 1532sop op; 1533size_t opnd; 1534{ 1535 /* avoid making error situations worse */ 1536 if (p->error != 0) 1537 return; 1538 1539 /* deal with oversize operands ("can't happen", more or less) */ 1540 assert(opnd < 1<<OPSHIFT); 1541 1542 /* deal with undersized strip */ 1543 if (p->slen >= p->ssize) 1544 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1545 assert(p->slen < p->ssize); 1546 1547 /* finally, it's all reduced to the easy case */ 1548 p->strip[p->slen++] = SOP(op, opnd); 1549} 1550 1551/* 1552 - doinsert - insert a sop into the strip 1553 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1554 */ 1555static void 1556doinsert(p, op, opnd, pos) 1557register struct parse *p; 1558sop op; 1559size_t opnd; 1560sopno pos; 1561{ 1562 register sopno sn; 1563 register sop s; 1564 register int i; 1565 1566 /* avoid making error situations worse */ 1567 if (p->error != 0) 1568 return; 1569 1570 sn = HERE(); 1571 EMIT(op, opnd); /* do checks, ensure space */ 1572 assert(HERE() == sn+1); 1573 s = p->strip[sn]; 1574 1575 /* adjust paren pointers */ 1576 assert(pos > 0); 1577 for (i = 1; i < NPAREN; i++) { 1578 if (p->pbegin[i] >= pos) { 1579 p->pbegin[i]++; 1580 } 1581 if (p->pend[i] >= pos) { 1582 p->pend[i]++; 1583 } 1584 } 1585 1586 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1587 (HERE()-pos-1)*sizeof(sop)); 1588 p->strip[pos] = s; 1589} 1590 1591/* 1592 - dofwd - complete a forward reference 1593 == static void dofwd(register struct parse *p, sopno pos, sop value); 1594 */ 1595static void 1596dofwd(p, pos, value) 1597register struct parse *p; 1598register sopno pos; 1599sop value; 1600{ 1601 /* avoid making error situations worse */ 1602 if (p->error != 0) 1603 return; 1604 1605 assert(value < 1<<OPSHIFT); 1606 p->strip[pos] = OP(p->strip[pos]) | value; 1607} 1608 1609/* 1610 - enlarge - enlarge the strip 1611 == static void enlarge(register struct parse *p, sopno size); 1612 */ 1613static void 1614enlarge(p, size) 1615register struct parse *p; 1616register sopno size; 1617{ 1618 register sop *sp; 1619 1620 if (p->ssize >= size) 1621 return; 1622 1623 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1624 if (sp == NULL) { 1625 SETERROR(REG_ESPACE); 1626 return; 1627 } 1628 p->strip = sp; 1629 p->ssize = size; 1630} 1631 1632/* 1633 - stripsnug - compact the strip 1634 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1635 */ 1636static void 1637stripsnug(p, g) 1638register struct parse *p; 1639register struct re_guts *g; 1640{ 1641 g->nstates = p->slen; 1642 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1643 if (g->strip == NULL) { 1644 SETERROR(REG_ESPACE); 1645 g->strip = p->strip; 1646 } 1647} 1648 1649/* 1650 - findmust - fill in must and mlen with longest mandatory literal string 1651 == static void findmust(register struct parse *p, register struct re_guts *g); 1652 * 1653 * This algorithm could do fancy things like analyzing the operands of | 1654 * for common subsequences. Someday. This code is simple and finds most 1655 * of the interesting cases. 1656 * 1657 * Note that must and mlen got initialized during setup. 1658 */ 1659static void 1660findmust(p, g) 1661struct parse *p; 1662register struct re_guts *g; 1663{ 1664 register sop *scan; 1665 sop *start; 1666 register sop *newstart; 1667 register sopno newlen; 1668 register sop s; 1669 register char *cp; 1670 register sopno i; 1671 1672 /* avoid making error situations worse */ 1673 if (p->error != 0) 1674 return; 1675 1676 /* find the longest OCHAR sequence in strip */ 1677 newlen = 0; 1678 scan = g->strip + 1; 1679 do { 1680 s = *scan++; 1681 switch (OP(s)) { 1682 case OCHAR: /* sequence member */ 1683 if (newlen == 0) /* new sequence */ 1684 newstart = scan - 1; 1685 newlen++; 1686 break; 1687 case OPLUS_: /* things that don't break one */ 1688 case OLPAREN: 1689 case ORPAREN: 1690 break; 1691 case OQUEST_: /* things that must be skipped */ 1692 case OCH_: 1693 scan--; 1694 do { 1695 scan += OPND(s); 1696 s = *scan; 1697 /* assert() interferes w debug printouts */ 1698 if (OP(s) != O_QUEST && OP(s) != O_CH && 1699 OP(s) != OOR2) { 1700 g->iflags |= BAD; 1701 return; 1702 } 1703 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1704 /* fallthrough */ 1705 default: /* things that break a sequence */ 1706 if (newlen > g->mlen) { /* ends one */ 1707 start = newstart; 1708 g->mlen = newlen; 1709 } 1710 newlen = 0; 1711 break; 1712 } 1713 } while (OP(s) != OEND); 1714 1715 if (g->mlen == 0) /* there isn't one */ 1716 return; 1717 1718 /* turn it into a character string */ 1719 g->must = malloc((size_t)g->mlen + 1); 1720 if (g->must == NULL) { /* argh; just forget it */ 1721 g->mlen = 0; 1722 return; 1723 } 1724 cp = g->must; 1725 scan = start; 1726 for (i = g->mlen; i > 0; i--) { 1727 while (OP(s = *scan++) != OCHAR) 1728 continue; 1729 assert(cp < g->must + g->mlen); 1730 *cp++ = (char)OPND(s); 1731 } 1732 assert(cp == g->must + g->mlen); 1733 *cp++ = '\0'; /* just on general principles */ 1734} 1735 1736/* 1737 - pluscount - count + nesting 1738 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1739 */ 1740static sopno /* nesting depth */ 1741pluscount(p, g) 1742struct parse *p; 1743register struct re_guts *g; 1744{ 1745 register sop *scan; 1746 register sop s; 1747 register sopno plusnest = 0; 1748 register sopno maxnest = 0; 1749 1750 if (p->error != 0) 1751 return(0); /* there may not be an OEND */ 1752 1753 scan = g->strip + 1; 1754 do { 1755 s = *scan++; 1756 switch (OP(s)) { 1757 case OPLUS_: 1758 plusnest++; 1759 break; 1760 case O_PLUS: 1761 if (plusnest > maxnest) 1762 maxnest = plusnest; 1763 plusnest--; 1764 break; 1765 } 1766 } while (OP(s) != OEND); 1767 if (plusnest != 0) 1768 g->iflags |= BAD; 1769 return(maxnest); 1770}
| 811 ) 812 CHadd(cs, i); 813 } 814 } 815 break; 816 } 817} 818 819/* 820 - p_b_cclass - parse a character-class name and deal with it 821 == static void p_b_cclass(register struct parse *p, register cset *cs); 822 */ 823static void 824p_b_cclass(p, cs) 825register struct parse *p; 826register cset *cs; 827{ 828 register int c; 829 register char *sp = p->next; 830 register struct cclass *cp; 831 register size_t len; 832 833 while (MORE() && isalpha((uch)PEEK())) 834 NEXT(); 835 len = p->next - sp; 836 for (cp = cclasses; cp->name != NULL; cp++) 837 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 838 break; 839 if (cp->name == NULL) { 840 /* oops, didn't find it */ 841 SETERROR(REG_ECTYPE); 842 return; 843 } 844 845 switch (cp->fidx) { 846 case CALNUM: 847 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 848 if (isalnum((uch)c)) 849 CHadd(cs, c); 850 break; 851 case CALPHA: 852 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 853 if (isalpha((uch)c)) 854 CHadd(cs, c); 855 break; 856 case CBLANK: 857 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 858 if (isblank((uch)c)) 859 CHadd(cs, c); 860 break; 861 case CCNTRL: 862 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 863 if (iscntrl((uch)c)) 864 CHadd(cs, c); 865 break; 866 case CDIGIT: 867 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 868 if (isdigit((uch)c)) 869 CHadd(cs, c); 870 break; 871 case CGRAPH: 872 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 873 if (isgraph((uch)c)) 874 CHadd(cs, c); 875 break; 876 case CLOWER: 877 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 878 if (islower((uch)c)) 879 CHadd(cs, c); 880 break; 881 case CPRINT: 882 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 883 if (isprint((uch)c)) 884 CHadd(cs, c); 885 break; 886 case CPUNCT: 887 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 888 if (ispunct((uch)c)) 889 CHadd(cs, c); 890 break; 891 case CSPACE: 892 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 893 if (isspace((uch)c)) 894 CHadd(cs, c); 895 break; 896 case CUPPER: 897 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 898 if (isupper((uch)c)) 899 CHadd(cs, c); 900 break; 901 case CXDIGIT: 902 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 903 if (isxdigit((uch)c)) 904 CHadd(cs, c); 905 break; 906 } 907#if 0 908 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 909 MCadd(p, cs, u); 910#endif 911} 912 913/* 914 - p_b_eclass - parse an equivalence-class name and deal with it 915 == static void p_b_eclass(register struct parse *p, register cset *cs); 916 * 917 * This implementation is incomplete. xxx 918 */ 919static void 920p_b_eclass(p, cs) 921register struct parse *p; 922register cset *cs; 923{ 924 register char c; 925 926 c = p_b_coll_elem(p, '='); 927 CHadd(cs, c); 928} 929 930/* 931 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 932 == static char p_b_symbol(register struct parse *p); 933 */ 934static char /* value of symbol */ 935p_b_symbol(p) 936register struct parse *p; 937{ 938 register char value; 939 940 (void)REQUIRE(MORE(), REG_EBRACK); 941 if (!EATTWO('[', '.')) 942 return(GETNEXT()); 943 944 /* collating symbol */ 945 value = p_b_coll_elem(p, '.'); 946 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 947 return(value); 948} 949 950/* 951 - p_b_coll_elem - parse a collating-element name and look it up 952 == static char p_b_coll_elem(register struct parse *p, int endc); 953 */ 954static char /* value of collating element */ 955p_b_coll_elem(p, endc) 956register struct parse *p; 957int endc; /* name ended by endc,']' */ 958{ 959 register char *sp = p->next; 960 register struct cname *cp; 961 register int len; 962 963 while (MORE() && !SEETWO(endc, ']')) 964 NEXT(); 965 if (!MORE()) { 966 SETERROR(REG_EBRACK); 967 return(0); 968 } 969 len = p->next - sp; 970 for (cp = cnames; cp->name != NULL; cp++) 971 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 972 return(cp->code); /* known name */ 973 if (len == 1) 974 return(*sp); /* single character */ 975 SETERROR(REG_ECOLLATE); /* neither */ 976 return(0); 977} 978 979/* 980 - othercase - return the case counterpart of an alphabetic 981 == static char othercase(int ch); 982 */ 983static char /* if no counterpart, return ch */ 984othercase(ch) 985int ch; 986{ 987 ch = (unsigned char)ch; 988 assert(isalpha(ch)); 989 if (isupper(ch)) 990 return(tolower(ch)); 991 else if (islower(ch)) 992 return(toupper(ch)); 993 else /* peculiar, but could happen */ 994 return(ch); 995} 996 997/* 998 - bothcases - emit a dualcase version of a two-case character 999 == static void bothcases(register struct parse *p, int ch); 1000 * 1001 * Boy, is this implementation ever a kludge... 1002 */ 1003static void 1004bothcases(p, ch) 1005register struct parse *p; 1006int ch; 1007{ 1008 register char *oldnext = p->next; 1009 register char *oldend = p->end; 1010 char bracket[3]; 1011 1012 ch = (unsigned char)ch; 1013 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1014 p->next = bracket; 1015 p->end = bracket+2; 1016 bracket[0] = ch; 1017 bracket[1] = ']'; 1018 bracket[2] = '\0'; 1019 p_bracket(p); 1020 assert(p->next == bracket+2); 1021 p->next = oldnext; 1022 p->end = oldend; 1023} 1024 1025/* 1026 - ordinary - emit an ordinary character 1027 == static void ordinary(register struct parse *p, register int ch); 1028 */ 1029static void 1030ordinary(p, ch) 1031register struct parse *p; 1032register int ch; 1033{ 1034 register cat_t *cap = p->g->categories; 1035 1036 if ((p->g->cflags®_ICASE) && isalpha((unsigned char)ch) && othercase(ch) != ch) 1037 bothcases(p, ch); 1038 else { 1039 EMIT(OCHAR, (unsigned char)ch); 1040 if (cap[ch] == 0) 1041 cap[ch] = p->g->ncategories++; 1042 } 1043} 1044 1045/* 1046 - nonnewline - emit REG_NEWLINE version of OANY 1047 == static void nonnewline(register struct parse *p); 1048 * 1049 * Boy, is this implementation ever a kludge... 1050 */ 1051static void 1052nonnewline(p) 1053register struct parse *p; 1054{ 1055 register char *oldnext = p->next; 1056 register char *oldend = p->end; 1057 char bracket[4]; 1058 1059 p->next = bracket; 1060 p->end = bracket+3; 1061 bracket[0] = '^'; 1062 bracket[1] = '\n'; 1063 bracket[2] = ']'; 1064 bracket[3] = '\0'; 1065 p_bracket(p); 1066 assert(p->next == bracket+3); 1067 p->next = oldnext; 1068 p->end = oldend; 1069} 1070 1071/* 1072 - repeat - generate code for a bounded repetition, recursively if needed 1073 == static void repeat(register struct parse *p, sopno start, int from, int to); 1074 */ 1075static void 1076repeat(p, start, from, to) 1077register struct parse *p; 1078sopno start; /* operand from here to end of strip */ 1079int from; /* repeated from this number */ 1080int to; /* to this number of times (maybe INFINITY) */ 1081{ 1082 register sopno finish = HERE(); 1083# define N 2 1084# define INF 3 1085# define REP(f, t) ((f)*8 + (t)) 1086# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1087 register sopno copy; 1088 1089 if (p->error != 0) /* head off possible runaway recursion */ 1090 return; 1091 1092 assert(from <= to); 1093 1094 switch (REP(MAP(from), MAP(to))) { 1095 case REP(0, 0): /* must be user doing this */ 1096 DROP(finish-start); /* drop the operand */ 1097 break; 1098 case REP(0, 1): /* as x{1,1}? */ 1099 case REP(0, N): /* as x{1,n}? */ 1100 case REP(0, INF): /* as x{1,}? */ 1101 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1102 INSERT(OCH_, start); /* offset is wrong... */ 1103 repeat(p, start+1, 1, to); 1104 ASTERN(OOR1, start); 1105 AHEAD(start); /* ... fix it */ 1106 EMIT(OOR2, 0); 1107 AHEAD(THERE()); 1108 ASTERN(O_CH, THERETHERE()); 1109 break; 1110 case REP(1, 1): /* trivial case */ 1111 /* done */ 1112 break; 1113 case REP(1, N): /* as x?x{1,n-1} */ 1114 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1115 INSERT(OCH_, start); 1116 ASTERN(OOR1, start); 1117 AHEAD(start); 1118 EMIT(OOR2, 0); /* offset very wrong... */ 1119 AHEAD(THERE()); /* ...so fix it */ 1120 ASTERN(O_CH, THERETHERE()); 1121 copy = dupl(p, start+1, finish+1); 1122 assert(copy == finish+4); 1123 repeat(p, copy, 1, to-1); 1124 break; 1125 case REP(1, INF): /* as x+ */ 1126 INSERT(OPLUS_, start); 1127 ASTERN(O_PLUS, start); 1128 break; 1129 case REP(N, N): /* as xx{m-1,n-1} */ 1130 copy = dupl(p, start, finish); 1131 repeat(p, copy, from-1, to-1); 1132 break; 1133 case REP(N, INF): /* as xx{n-1,INF} */ 1134 copy = dupl(p, start, finish); 1135 repeat(p, copy, from-1, to); 1136 break; 1137 default: /* "can't happen" */ 1138 SETERROR(REG_ASSERT); /* just in case */ 1139 break; 1140 } 1141} 1142 1143/* 1144 - seterr - set an error condition 1145 == static int seterr(register struct parse *p, int e); 1146 */ 1147static int /* useless but makes type checking happy */ 1148seterr(p, e) 1149register struct parse *p; 1150int e; 1151{ 1152 if (p->error == 0) /* keep earliest error condition */ 1153 p->error = e; 1154 p->next = nuls; /* try to bring things to a halt */ 1155 p->end = nuls; 1156 return(0); /* make the return value well-defined */ 1157} 1158 1159/* 1160 - allocset - allocate a set of characters for [] 1161 == static cset *allocset(register struct parse *p); 1162 */ 1163static cset * 1164allocset(p) 1165register struct parse *p; 1166{ 1167 register int no = p->g->ncsets++; 1168 register size_t nc; 1169 register size_t nbytes; 1170 register cset *cs; 1171 register size_t css = (size_t)p->g->csetsize; 1172 register int i; 1173 1174 if (no >= p->ncsalloc) { /* need another column of space */ 1175 p->ncsalloc += CHAR_BIT; 1176 nc = p->ncsalloc; 1177 assert(nc % CHAR_BIT == 0); 1178 nbytes = nc / CHAR_BIT * css; 1179 if (p->g->sets == NULL) 1180 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1181 else 1182 p->g->sets = (cset *)realloc((char *)p->g->sets, 1183 nc * sizeof(cset)); 1184 if (p->g->setbits == NULL) 1185 p->g->setbits = (uch *)malloc(nbytes); 1186 else { 1187 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1188 nbytes); 1189 /* xxx this isn't right if setbits is now NULL */ 1190 for (i = 0; i < no; i++) 1191 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1192 } 1193 if (p->g->sets != NULL && p->g->setbits != NULL) 1194 (void) memset((char *)p->g->setbits + (nbytes - css), 1195 0, css); 1196 else { 1197 no = 0; 1198 SETERROR(REG_ESPACE); 1199 /* caller's responsibility not to do set ops */ 1200 } 1201 } 1202 1203 assert(p->g->sets != NULL); /* xxx */ 1204 cs = &p->g->sets[no]; 1205 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1206 cs->mask = 1 << ((no) % CHAR_BIT); 1207 cs->hash = 0; 1208 cs->smultis = 0; 1209 cs->multis = NULL; 1210 1211 return(cs); 1212} 1213 1214/* 1215 - freeset - free a now-unused set 1216 == static void freeset(register struct parse *p, register cset *cs); 1217 */ 1218static void 1219freeset(p, cs) 1220register struct parse *p; 1221register cset *cs; 1222{ 1223 register int i; 1224 register cset *top = &p->g->sets[p->g->ncsets]; 1225 register size_t css = (size_t)p->g->csetsize; 1226 1227 for (i = 0; i < css; i++) 1228 CHsub(cs, i); 1229 if (cs == top-1) /* recover only the easy case */ 1230 p->g->ncsets--; 1231} 1232 1233/* 1234 - freezeset - final processing on a set of characters 1235 == static int freezeset(register struct parse *p, register cset *cs); 1236 * 1237 * The main task here is merging identical sets. This is usually a waste 1238 * of time (although the hash code minimizes the overhead), but can win 1239 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1240 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1241 * the same value! 1242 */ 1243static int /* set number */ 1244freezeset(p, cs) 1245register struct parse *p; 1246register cset *cs; 1247{ 1248 register short h = cs->hash; 1249 register int i; 1250 register cset *top = &p->g->sets[p->g->ncsets]; 1251 register cset *cs2; 1252 register size_t css = (size_t)p->g->csetsize; 1253 1254 /* look for an earlier one which is the same */ 1255 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1256 if (cs2->hash == h && cs2 != cs) { 1257 /* maybe */ 1258 for (i = 0; i < css; i++) 1259 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1260 break; /* no */ 1261 if (i == css) 1262 break; /* yes */ 1263 } 1264 1265 if (cs2 < top) { /* found one */ 1266 freeset(p, cs); 1267 cs = cs2; 1268 } 1269 1270 return((int)(cs - p->g->sets)); 1271} 1272 1273/* 1274 - firstch - return first character in a set (which must have at least one) 1275 == static int firstch(register struct parse *p, register cset *cs); 1276 */ 1277static int /* character; there is no "none" value */ 1278firstch(p, cs) 1279register struct parse *p; 1280register cset *cs; 1281{ 1282 register int i; 1283 register size_t css = (size_t)p->g->csetsize; 1284 1285 for (i = 0; i < css; i++) 1286 if (CHIN(cs, i)) 1287 return((unsigned char)i); 1288 assert(never); 1289 return(0); /* arbitrary */ 1290} 1291 1292/* 1293 - nch - number of characters in a set 1294 == static int nch(register struct parse *p, register cset *cs); 1295 */ 1296static int 1297nch(p, cs) 1298register struct parse *p; 1299register cset *cs; 1300{ 1301 register int i; 1302 register size_t css = (size_t)p->g->csetsize; 1303 register int n = 0; 1304 1305 for (i = 0; i < css; i++) 1306 if (CHIN(cs, i)) 1307 n++; 1308 return(n); 1309} 1310 1311/* 1312 - mcadd - add a collating element to a cset 1313 == static void mcadd(register struct parse *p, register cset *cs, \ 1314 == register char *cp); 1315 */ 1316static void 1317mcadd(p, cs, cp) 1318register struct parse *p; 1319register cset *cs; 1320register char *cp; 1321{ 1322 register size_t oldend = cs->smultis; 1323 1324 cs->smultis += strlen(cp) + 1; 1325 if (cs->multis == NULL) 1326 cs->multis = malloc(cs->smultis); 1327 else 1328 cs->multis = realloc(cs->multis, cs->smultis); 1329 if (cs->multis == NULL) { 1330 SETERROR(REG_ESPACE); 1331 return; 1332 } 1333 1334 (void) strcpy(cs->multis + oldend - 1, cp); 1335 cs->multis[cs->smultis - 1] = '\0'; 1336} 1337 1338#if used 1339/* 1340 - mcsub - subtract a collating element from a cset 1341 == static void mcsub(register cset *cs, register char *cp); 1342 */ 1343static void 1344mcsub(cs, cp) 1345register cset *cs; 1346register char *cp; 1347{ 1348 register char *fp = mcfind(cs, cp); 1349 register size_t len = strlen(fp); 1350 1351 assert(fp != NULL); 1352 (void) memmove(fp, fp + len + 1, 1353 cs->smultis - (fp + len + 1 - cs->multis)); 1354 cs->smultis -= len; 1355 1356 if (cs->smultis == 0) { 1357 free(cs->multis); 1358 cs->multis = NULL; 1359 return; 1360 } 1361 1362 cs->multis = realloc(cs->multis, cs->smultis); 1363 assert(cs->multis != NULL); 1364} 1365 1366/* 1367 - mcin - is a collating element in a cset? 1368 == static int mcin(register cset *cs, register char *cp); 1369 */ 1370static int 1371mcin(cs, cp) 1372register cset *cs; 1373register char *cp; 1374{ 1375 return(mcfind(cs, cp) != NULL); 1376} 1377 1378/* 1379 - mcfind - find a collating element in a cset 1380 == static char *mcfind(register cset *cs, register char *cp); 1381 */ 1382static char * 1383mcfind(cs, cp) 1384register cset *cs; 1385register char *cp; 1386{ 1387 register char *p; 1388 1389 if (cs->multis == NULL) 1390 return(NULL); 1391 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1392 if (strcmp(cp, p) == 0) 1393 return(p); 1394 return(NULL); 1395} 1396#endif 1397 1398/* 1399 - mcinvert - invert the list of collating elements in a cset 1400 == static void mcinvert(register struct parse *p, register cset *cs); 1401 * 1402 * This would have to know the set of possibilities. Implementation 1403 * is deferred. 1404 */ 1405static void 1406mcinvert(p, cs) 1407register struct parse *p; 1408register cset *cs; 1409{ 1410 assert(cs->multis == NULL); /* xxx */ 1411} 1412 1413/* 1414 - mccase - add case counterparts of the list of collating elements in a cset 1415 == static void mccase(register struct parse *p, register cset *cs); 1416 * 1417 * This would have to know the set of possibilities. Implementation 1418 * is deferred. 1419 */ 1420static void 1421mccase(p, cs) 1422register struct parse *p; 1423register cset *cs; 1424{ 1425 assert(cs->multis == NULL); /* xxx */ 1426} 1427 1428/* 1429 - isinsets - is this character in any sets? 1430 == static int isinsets(register struct re_guts *g, int c); 1431 */ 1432static int /* predicate */ 1433isinsets(g, c) 1434register struct re_guts *g; 1435int c; 1436{ 1437 register uch *col; 1438 register int i; 1439 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1440 register unsigned uc = (unsigned char)c; 1441 1442 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1443 if (col[uc] != 0) 1444 return(1); 1445 return(0); 1446} 1447 1448/* 1449 - samesets - are these two characters in exactly the same sets? 1450 == static int samesets(register struct re_guts *g, int c1, int c2); 1451 */ 1452static int /* predicate */ 1453samesets(g, c1, c2) 1454register struct re_guts *g; 1455int c1; 1456int c2; 1457{ 1458 register uch *col; 1459 register int i; 1460 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1461 register unsigned uc1 = (unsigned char)c1; 1462 register unsigned uc2 = (unsigned char)c2; 1463 1464 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1465 if (col[uc1] != col[uc2]) 1466 return(0); 1467 return(1); 1468} 1469 1470/* 1471 - categorize - sort out character categories 1472 == static void categorize(struct parse *p, register struct re_guts *g); 1473 */ 1474static void 1475categorize(p, g) 1476struct parse *p; 1477register struct re_guts *g; 1478{ 1479 register cat_t *cats = g->categories; 1480 register int c; 1481 register int c2; 1482 register cat_t cat; 1483 1484 /* avoid making error situations worse */ 1485 if (p->error != 0) 1486 return; 1487 1488 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1489 if (cats[c] == 0 && isinsets(g, c)) { 1490 cat = g->ncategories++; 1491 cats[c] = cat; 1492 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1493 if (cats[c2] == 0 && samesets(g, c, c2)) 1494 cats[c2] = cat; 1495 } 1496} 1497 1498/* 1499 - dupl - emit a duplicate of a bunch of sops 1500 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1501 */ 1502static sopno /* start of duplicate */ 1503dupl(p, start, finish) 1504register struct parse *p; 1505sopno start; /* from here */ 1506sopno finish; /* to this less one */ 1507{ 1508 register sopno ret = HERE(); 1509 register sopno len = finish - start; 1510 1511 assert(finish >= start); 1512 if (len == 0) 1513 return(ret); 1514 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1515 assert(p->ssize >= p->slen + len); 1516 (void) memcpy((char *)(p->strip + p->slen), 1517 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1518 p->slen += len; 1519 return(ret); 1520} 1521 1522/* 1523 - doemit - emit a strip operator 1524 == static void doemit(register struct parse *p, sop op, size_t opnd); 1525 * 1526 * It might seem better to implement this as a macro with a function as 1527 * hard-case backup, but it's just too big and messy unless there are 1528 * some changes to the data structures. Maybe later. 1529 */ 1530static void 1531doemit(p, op, opnd) 1532register struct parse *p; 1533sop op; 1534size_t opnd; 1535{ 1536 /* avoid making error situations worse */ 1537 if (p->error != 0) 1538 return; 1539 1540 /* deal with oversize operands ("can't happen", more or less) */ 1541 assert(opnd < 1<<OPSHIFT); 1542 1543 /* deal with undersized strip */ 1544 if (p->slen >= p->ssize) 1545 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1546 assert(p->slen < p->ssize); 1547 1548 /* finally, it's all reduced to the easy case */ 1549 p->strip[p->slen++] = SOP(op, opnd); 1550} 1551 1552/* 1553 - doinsert - insert a sop into the strip 1554 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1555 */ 1556static void 1557doinsert(p, op, opnd, pos) 1558register struct parse *p; 1559sop op; 1560size_t opnd; 1561sopno pos; 1562{ 1563 register sopno sn; 1564 register sop s; 1565 register int i; 1566 1567 /* avoid making error situations worse */ 1568 if (p->error != 0) 1569 return; 1570 1571 sn = HERE(); 1572 EMIT(op, opnd); /* do checks, ensure space */ 1573 assert(HERE() == sn+1); 1574 s = p->strip[sn]; 1575 1576 /* adjust paren pointers */ 1577 assert(pos > 0); 1578 for (i = 1; i < NPAREN; i++) { 1579 if (p->pbegin[i] >= pos) { 1580 p->pbegin[i]++; 1581 } 1582 if (p->pend[i] >= pos) { 1583 p->pend[i]++; 1584 } 1585 } 1586 1587 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1588 (HERE()-pos-1)*sizeof(sop)); 1589 p->strip[pos] = s; 1590} 1591 1592/* 1593 - dofwd - complete a forward reference 1594 == static void dofwd(register struct parse *p, sopno pos, sop value); 1595 */ 1596static void 1597dofwd(p, pos, value) 1598register struct parse *p; 1599register sopno pos; 1600sop value; 1601{ 1602 /* avoid making error situations worse */ 1603 if (p->error != 0) 1604 return; 1605 1606 assert(value < 1<<OPSHIFT); 1607 p->strip[pos] = OP(p->strip[pos]) | value; 1608} 1609 1610/* 1611 - enlarge - enlarge the strip 1612 == static void enlarge(register struct parse *p, sopno size); 1613 */ 1614static void 1615enlarge(p, size) 1616register struct parse *p; 1617register sopno size; 1618{ 1619 register sop *sp; 1620 1621 if (p->ssize >= size) 1622 return; 1623 1624 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1625 if (sp == NULL) { 1626 SETERROR(REG_ESPACE); 1627 return; 1628 } 1629 p->strip = sp; 1630 p->ssize = size; 1631} 1632 1633/* 1634 - stripsnug - compact the strip 1635 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1636 */ 1637static void 1638stripsnug(p, g) 1639register struct parse *p; 1640register struct re_guts *g; 1641{ 1642 g->nstates = p->slen; 1643 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1644 if (g->strip == NULL) { 1645 SETERROR(REG_ESPACE); 1646 g->strip = p->strip; 1647 } 1648} 1649 1650/* 1651 - findmust - fill in must and mlen with longest mandatory literal string 1652 == static void findmust(register struct parse *p, register struct re_guts *g); 1653 * 1654 * This algorithm could do fancy things like analyzing the operands of | 1655 * for common subsequences. Someday. This code is simple and finds most 1656 * of the interesting cases. 1657 * 1658 * Note that must and mlen got initialized during setup. 1659 */ 1660static void 1661findmust(p, g) 1662struct parse *p; 1663register struct re_guts *g; 1664{ 1665 register sop *scan; 1666 sop *start; 1667 register sop *newstart; 1668 register sopno newlen; 1669 register sop s; 1670 register char *cp; 1671 register sopno i; 1672 1673 /* avoid making error situations worse */ 1674 if (p->error != 0) 1675 return; 1676 1677 /* find the longest OCHAR sequence in strip */ 1678 newlen = 0; 1679 scan = g->strip + 1; 1680 do { 1681 s = *scan++; 1682 switch (OP(s)) { 1683 case OCHAR: /* sequence member */ 1684 if (newlen == 0) /* new sequence */ 1685 newstart = scan - 1; 1686 newlen++; 1687 break; 1688 case OPLUS_: /* things that don't break one */ 1689 case OLPAREN: 1690 case ORPAREN: 1691 break; 1692 case OQUEST_: /* things that must be skipped */ 1693 case OCH_: 1694 scan--; 1695 do { 1696 scan += OPND(s); 1697 s = *scan; 1698 /* assert() interferes w debug printouts */ 1699 if (OP(s) != O_QUEST && OP(s) != O_CH && 1700 OP(s) != OOR2) { 1701 g->iflags |= BAD; 1702 return; 1703 } 1704 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1705 /* fallthrough */ 1706 default: /* things that break a sequence */ 1707 if (newlen > g->mlen) { /* ends one */ 1708 start = newstart; 1709 g->mlen = newlen; 1710 } 1711 newlen = 0; 1712 break; 1713 } 1714 } while (OP(s) != OEND); 1715 1716 if (g->mlen == 0) /* there isn't one */ 1717 return; 1718 1719 /* turn it into a character string */ 1720 g->must = malloc((size_t)g->mlen + 1); 1721 if (g->must == NULL) { /* argh; just forget it */ 1722 g->mlen = 0; 1723 return; 1724 } 1725 cp = g->must; 1726 scan = start; 1727 for (i = g->mlen; i > 0; i--) { 1728 while (OP(s = *scan++) != OCHAR) 1729 continue; 1730 assert(cp < g->must + g->mlen); 1731 *cp++ = (char)OPND(s); 1732 } 1733 assert(cp == g->must + g->mlen); 1734 *cp++ = '\0'; /* just on general principles */ 1735} 1736 1737/* 1738 - pluscount - count + nesting 1739 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1740 */ 1741static sopno /* nesting depth */ 1742pluscount(p, g) 1743struct parse *p; 1744register struct re_guts *g; 1745{ 1746 register sop *scan; 1747 register sop s; 1748 register sopno plusnest = 0; 1749 register sopno maxnest = 0; 1750 1751 if (p->error != 0) 1752 return(0); /* there may not be an OEND */ 1753 1754 scan = g->strip + 1; 1755 do { 1756 s = *scan++; 1757 switch (OP(s)) { 1758 case OPLUS_: 1759 plusnest++; 1760 break; 1761 case O_PLUS: 1762 if (plusnest > maxnest) 1763 maxnest = plusnest; 1764 plusnest--; 1765 break; 1766 } 1767 } while (OP(s) != OEND); 1768 if (plusnest != 0) 1769 g->iflags |= BAD; 1770 return(maxnest); 1771}
|