118334Speter/* $NetBSD: regcomp.c,v 1.8 2021/05/17 04:01:57 rin Exp $ */ 218334Speter/*- 318334Speter * Copyright (c) 1992, 1993, 1994 Henry Spencer. 418334Speter * Copyright (c) 1992, 1993, 1994 518334Speter * The Regents of the University of California. All rights reserved. 618334Speter * 718334Speter * This code is derived from software contributed to Berkeley by 818334Speter * Henry Spencer of the University of Toronto. 918334Speter * 1018334Speter * Redistribution and use in source and binary forms, with or without 1118334Speter * modification, are permitted provided that the following conditions 1218334Speter * are met: 1318334Speter * 1. Redistributions of source code must retain the above copyright 1418334Speter * notice, this list of conditions and the following disclaimer. 1518334Speter * 2. Redistributions in binary form must reproduce the above copyright 1618334Speter * notice, this list of conditions and the following disclaimer in the 1718334Speter * documentation and/or other materials provided with the distribution. 1818334Speter * 3. All advertising materials mentioning features or use of this software 1918334Speter * must display the following acknowledgement: 2018334Speter * This product includes software developed by the University of 2118334Speter * California, Berkeley and its contributors. 2218334Speter * 4. Neither the name of the University nor the names of its contributors 2318334Speter * may be used to endorse or promote products derived from this software 2418334Speter * without specific prior written permission. 2518334Speter * 2618334Speter * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 2718334Speter * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2818334Speter * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2918334Speter * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 3018334Speter * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 3118334Speter * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 3218334Speter * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 3318334Speter * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 3418334Speter * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3518334Speter * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3618334Speter * SUCH DAMAGE. 3718334Speter * 3818334Speter * @(#)regcomp.c 8.4 (Berkeley) 3/19/94 3918334Speter */ 4018334Speter 4118334Speter#include <sys/cdefs.h> 4218334Speter#if 0 4318334Speter#if defined(LIBC_SCCS) && !defined(lint) 4418334Speterstatic char sccsid[] = "@(#)regcomp.c 8.4 (Berkeley) 3/19/94"; 4518334Speter#endif /* LIBC_SCCS and not lint */ 4618334Speter#else 4718334Speter__RCSID("$NetBSD: regcomp.c,v 1.8 2021/05/17 04:01:57 rin Exp $"); 4818334Speter#endif 4918334Speter 5018334Speter#include <sys/types.h> 5118334Speter#include <stdio.h> 5218334Speter#include <string.h> 5318334Speter#include <ctype.h> 5418334Speter#include <limits.h> 5518334Speter#include <stdlib.h> 5618334Speter#include <regex.h> 5718334Speter 5818334Speter#include "utils.h" 5918334Speter#include "regex2.h" 6018334Speter 6118334Speter#include "cclass.h" 6218334Speter#include "cname.h" 6318334Speter 6418334Speter/* 6518334Speter * parse structure, passed up and down to avoid global variables and 6618334Speter * other clumsinesses 6718334Speter */ 6818334Speterstruct parse { 6918334Speter RCHAR_T *next; /* next character in RE */ 7018334Speter RCHAR_T *end; /* end of string (-> NUL normally) */ 7118334Speter int error; /* has an error been seen? */ 7218334Speter sop *strip; /* malloced strip */ 7318334Speter RCHAR_T *stripdata; /* malloced stripdata */ 7418334Speter sopno ssize; /* malloced strip size (allocated) */ 7518334Speter sopno slen; /* malloced strip length (used) */ 7618334Speter int ncsalloc; /* number of csets allocated */ 7718334Speter struct re_guts *g; 7818334Speter# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 7918334Speter sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 8018334Speter sopno pend[NPAREN]; /* -> ) ([0] unused) */ 8118334Speter}; 8218334Speter 8318334Speter/* ========= begin header generated by ./mkh ========= */ 8418334Speter#ifdef __cplusplus 8518334Speterextern "C" { 8618334Speter#endif 8718334Speter 8818334Speter/* === regcomp.c === */ 8918334Speterstatic void p_ere __P((struct parse *p, int stop, size_t reclimit)); 9018334Speterstatic void p_ere_exp __P((struct parse *p, size_t reclimit)); 9118334Speterstatic void p_str __P((struct parse *p)); 9218334Speterstatic void p_bre __P((struct parse *p, int end1, int end2, size_t reclimit)); 9318334Speterstatic int p_simp_re __P((struct parse *p, int starordinary, size_t reclimit)); 9418334Speterstatic int p_count __P((struct parse *p)); 9518334Speterstatic void p_bracket __P((struct parse *p)); 9618334Speterstatic void p_b_term __P((struct parse *p, cset *cs)); 9718334Speterstatic void p_b_cclass __P((struct parse *p, cset *cs)); 9818334Speterstatic void p_b_eclass __P((struct parse *p, cset *cs)); 9918334Speterstatic char p_b_symbol __P((struct parse *p)); 10018334Speterstatic char p_b_coll_elem __P((struct parse *p, int endc)); 10118334Speterstatic RCHAR_T othercase __P((int ch)); 10218334Speterstatic void bothcases __P((struct parse *p, int ch)); 10318334Speterstatic void ordinary __P((struct parse *p, int ch)); 10418334Speterstatic void nonnewline __P((struct parse *p)); 10518334Speterstatic void repeat __P((struct parse *p, sopno start, int from, int to, size_t reclimit)); 10618334Speterstatic int seterr __P((struct parse *p, int e)); 10718334Speterstatic cset *allocset __P((struct parse *p)); 10818334Speterstatic void freeset __P((struct parse *p, cset *cs)); 10918334Speterstatic int freezeset __P((struct parse *p, cset *cs)); 11018334Speterstatic int firstch __P((struct parse *p, cset *cs)); 11118334Speterstatic int nch __P((struct parse *p, cset *cs)); 11218334Speterstatic void mcadd __P((struct parse *p, cset *cs, const char *cp)); 11318334Speter#ifdef notdef 11418334Speterstatic void mcsub __P((cset *cs, char *cp)); 11518334Speterstatic int mcin __P((cset *cs, char *cp)); 11618334Speterstatic char *mcfind __P((cset *cs, char *cp)); 11718334Speter#endif 11818334Speterstatic void mcinvert __P((struct parse *p, cset *cs)); 11918334Speterstatic void mccase __P((struct parse *p, cset *cs)); 12018334Speter#ifdef notdef 12118334Speterstatic int isinsets __P((struct re_guts *g, int c)); 12218334Speterstatic int samesets __P((struct re_guts *g, int c1, int c2)); 12318334Speter#endif 12418334Speterstatic void categorize __P((struct parse *p, struct re_guts *g)); 12518334Speterstatic sopno dupl __P((struct parse *p, sopno start, sopno finish)); 12618334Speterstatic void doemit __P((struct parse *p, sop op, size_t opnd)); 12718334Speterstatic void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 12818334Speterstatic void dofwd __P((struct parse *p, sopno pos, sop value)); 12918334Speterstatic int enlarge __P((struct parse *p, sopno size)); 13018334Speterstatic void stripsnug __P((struct parse *p, struct re_guts *g)); 13118334Speterstatic void findmust __P((struct parse *p, struct re_guts *g)); 13218334Speterstatic sopno pluscount __P((struct parse *p, struct re_guts *g)); 13318334Speter 13418334Speter#ifdef __cplusplus 13518334Speter} 13618334Speter#endif 13718334Speter/* ========= end header generated by ./mkh ========= */ 13818334Speter 13918334Speterstatic RCHAR_T nuls[10]; /* place to point scanner in event of error */ 14018334Speter 14118334Speter/* 14218334Speter * macros for use with parse structure 14318334Speter * BEWARE: these know that the parse structure is named `p' !!! 14418334Speter */ 14518334Speter#define PEEK() (*p->next) 14618334Speter#define PEEK2() (*(p->next+1)) 14718334Speter#define MORE() (p->next < p->end) 14818334Speter#define MORE2() (p->next+1 < p->end) 14918334Speter#define SEE(c) (MORE() && PEEK() == (c)) 15018334Speter#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 15118334Speter#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 15218334Speter#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 15318334Speter#define NEXT() (p->next++) 15418334Speter#define NEXT2() (p->next += 2) 15518334Speter#define NEXTn(n) (p->next += (n)) 15618334Speter#define GETNEXT() (*p->next++) 15718334Speter#define SETERROR(e) seterr(p, (e)) 15818334Speter#define REQUIRE(co, e) ((co) || SETERROR(e)) 15918334Speter#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 16018334Speter#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 16118334Speter#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 16218334Speter#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 16318334Speter#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 16418334Speter#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 16518334Speter#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 16618334Speter#define HERE() (p->slen) 16718334Speter#define THERE() (p->slen - 1) 16818334Speter#define THERETHERE() (p->slen - 2) 16918334Speter#define DROP(n) (p->slen -= (n)) 17018334Speter 17118334Speter#ifndef NDEBUG 17218334Speterstatic int never = 0; /* for use in asserts; shuts lint up */ 17318334Speter#else 17418334Speter#define never 0 /* some <assert.h>s have bugs too */ 17518334Speter#endif 17618334Speter 17718334Speter#define MEMLIMIT 0x8000000 17818334Speter#define MEMSIZE(p) \ 17918334Speter ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \ 18018334Speter (p)->ncsalloc * sizeof(cset) + \ 18118334Speter (p)->ssize * sizeof(sop)) 18218334Speter#define RECLIMIT 256 18318334Speter 18418334Speter/* 18518334Speter - regcomp - interface for parser and compilation 18618334Speter = extern int regcomp(regex_t *, const RCHAR_T *, int); 18718334Speter = #define REG_BASIC 0000 18818334Speter = #define REG_EXTENDED 0001 18918334Speter = #define REG_ICASE 0002 19018334Speter = #define REG_NOSUB 0004 19118334Speter = #define REG_NEWLINE 0010 19218334Speter = #define REG_NOSPEC 0020 19318334Speter = #define REG_PEND 0040 19418334Speter = #define REG_DUMP 0200 19518334Speter */ 19618334Speterint /* 0 success, otherwise REG_something */ 19718334Speterregcomp(regex_t *preg, const RCHAR_T *pattern, int cflags) 19818334Speter{ 19918334Speter struct parse pa; 20018334Speter struct re_guts *g; 20118334Speter struct parse *p = &pa; 20218334Speter int i; 20318334Speter size_t len; 20418334Speter#ifdef REDEBUG 20518334Speter# define GOODFLAGS(f) (f) 20618334Speter#else 20718334Speter# define GOODFLAGS(f) ((f)&~REG_DUMP) 20818334Speter#endif 20918334Speter 21018334Speter cflags = GOODFLAGS(cflags); 21118334Speter if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 21218334Speter return(REG_INVARG); 21318334Speter 21418334Speter if (cflags®_PEND) { 21518334Speter if (preg->re_endp < pattern) 21618334Speter return(REG_INVARG); 21718334Speter len = preg->re_endp - pattern; 21818334Speter } else 21918334Speter len = STRLEN(pattern); 22018334Speter 22118334Speter /* do the mallocs early so failure handling is easy */ 22218334Speter g = (struct re_guts *)malloc(sizeof(struct re_guts) + 22318334Speter (NC-1)*sizeof(cat_t)); 22418334Speter if (g == NULL) 22518334Speter return(REG_ESPACE); 22618334Speter p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 22718334Speter p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 22818334Speter if (p->strip == NULL) { 22918334Speter free((char *)g); 23018334Speter return(REG_ESPACE); 23118334Speter } 23218334Speter p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T)); 23318334Speter if (p->stripdata == NULL) { 23418334Speter free((char *)p->strip); 23518334Speter free((char *)g); 23618334Speter return(REG_ESPACE); 23718334Speter } 23818334Speter p->slen = 0; 23918334Speter 24018334Speter /* set things up */ 24118334Speter p->g = g; 24218334Speter p->next = (RCHAR_T *)__UNCONST(pattern); /* convenience; we do not modify it */ 24318334Speter p->end = p->next + len; 24418334Speter p->error = 0; 24518334Speter p->ncsalloc = 0; 24618334Speter for (i = 0; i < NPAREN; i++) { 24718334Speter p->pbegin[i] = 0; 24818334Speter p->pend[i] = 0; 24918334Speter } 25018334Speter g->csetsize = NC; 25118334Speter g->sets = NULL; 25218334Speter g->setbits = NULL; 25318334Speter g->ncsets = 0; 25418334Speter g->cflags = cflags; 25518334Speter g->iflags = 0; 25618334Speter g->nbol = 0; 25718334Speter g->neol = 0; 25818334Speter g->must = NULL; 25918334Speter g->mlen = 0; 26018334Speter g->nsub = 0; 26118334Speter#if 0 26218334Speter g->ncategories = 1; /* category 0 is "everything else" */ 26318334Speter g->categories = &g->catspace[-(CHAR_MIN)]; 26418334Speter (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 26518334Speter#endif 26618334Speter g->backrefs = 0; 26718334Speter 26818334Speter /* do it */ 26918334Speter EMIT(OEND, 0); 27018334Speter g->firststate = THERE(); 27118334Speter if (cflags®_EXTENDED) 27218334Speter p_ere(p, OUT, 0); 27318334Speter else if (cflags®_NOSPEC) 27418334Speter p_str(p); 27518334Speter else 27618334Speter p_bre(p, OUT, OUT, 0); 27718334Speter EMIT(OEND, 0); 27818334Speter g->laststate = THERE(); 27918334Speter 28018334Speter /* tidy up loose ends and fill things in */ 28118334Speter categorize(p, g); 28218334Speter stripsnug(p, g); 28318334Speter findmust(p, g); 28418334Speter g->nplus = pluscount(p, g); 28518334Speter g->magic = MAGIC2; 28618334Speter preg->re_nsub = g->nsub; 28718334Speter preg->re_g = g; 28818334Speter preg->re_magic = MAGIC1; 28918334Speter#ifndef REDEBUG 29018334Speter /* not debugging, so can't rely on the assert() in regexec() */ 29118334Speter if (g->iflags&BAD) 29218334Speter SETERROR(REG_ASSERT); 29318334Speter#endif 29418334Speter 29518334Speter /* win or lose, we're done */ 29618334Speter if (p->error != 0) /* lose */ 29718334Speter regfree(preg); 29818334Speter return(p->error); 29918334Speter} 30018334Speter 30118334Speter/* 30218334Speter - p_ere - ERE parser top level, concatenation and alternation 30318334Speter == static void p_ere(struct parse *p, int stop, size_t reclimit); 30418334Speter */ 30518334Speterstatic void 30618334Speterp_ere(struct parse *p, int stop, size_t reclimit) 30718334Speter 30818334Speter /* character this ERE should end at */ 30918334Speter{ 31018334Speter char c; 31118334Speter sopno prevback = 0; 31218334Speter sopno prevfwd = 0; 31318334Speter sopno conc; 31418334Speter int first = 1; /* is this the first alternative? */ 31518334Speter 31618334Speter if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 31718334Speter p->error = REG_ESPACE; 31818334Speter return; 31918334Speter } 32018334Speter 32118334Speter for (;;) { 32218334Speter /* do a bunch of concatenated expressions */ 32318334Speter conc = HERE(); 32418334Speter while (MORE() && (c = PEEK()) != '|' && c != stop) 32518334Speter p_ere_exp(p, reclimit); 32618334Speter (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 32718334Speter 32818334Speter if (!EAT('|')) 32918334Speter break; /* NOTE BREAK OUT */ 330 331 if (first) { 332 INSERT(OCH_, conc); /* offset is wrong */ 333 prevfwd = conc; 334 prevback = conc; 335 first = 0; 336 } 337 ASTERN(OOR1, prevback); 338 prevback = THERE(); 339 AHEAD(prevfwd); /* fix previous offset */ 340 prevfwd = HERE(); 341 EMIT(OOR2, 0); /* offset is very wrong */ 342 } 343 344 if (!first) { /* tail-end fixups */ 345 AHEAD(prevfwd); 346 ASTERN(O_CH, prevback); 347 } 348 349 assert(!MORE() || SEE(stop)); 350} 351 352/* 353 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 354 == static void p_ere_exp(struct parse *p); 355 */ 356static void 357p_ere_exp(struct parse *p, size_t reclimit) 358{ 359 char c; 360 sopno pos; 361 int count; 362 int count2; 363 sopno subno; 364 int wascaret = 0; 365 366 assert(MORE()); /* caller should have ensured this */ 367 c = GETNEXT(); 368 369 pos = HERE(); 370 switch (c) { 371 case '(': 372 (void)REQUIRE(MORE(), REG_EPAREN); 373 p->g->nsub++; 374 subno = p->g->nsub; 375 if (subno < NPAREN) 376 p->pbegin[subno] = HERE(); 377 EMIT(OLPAREN, subno); 378 if (!SEE(')')) 379 p_ere(p, ')', reclimit); 380 if (subno < NPAREN) { 381 p->pend[subno] = HERE(); 382 assert(p->pend[subno] != 0); 383 } 384 EMIT(ORPAREN, subno); 385 (void)MUSTEAT(')', REG_EPAREN); 386 break; 387#ifndef POSIX_MISTAKE 388 case ')': /* happens only if no current unmatched ( */ 389 /* 390 * You may ask, why the ifndef? Because I didn't notice 391 * this until slightly too late for 1003.2, and none of the 392 * other 1003.2 regular-expression reviewers noticed it at 393 * all. So an unmatched ) is legal POSIX, at least until 394 * we can get it fixed. 395 */ 396 SETERROR(REG_EPAREN); 397 break; 398#endif 399 case '^': 400 EMIT(OBOL, 0); 401 p->g->iflags |= USEBOL; 402 p->g->nbol++; 403 wascaret = 1; 404 break; 405 case '$': 406 EMIT(OEOL, 0); 407 p->g->iflags |= USEEOL; 408 p->g->neol++; 409 break; 410 case '|': 411 SETERROR(REG_EMPTY); 412 break; 413 case '*': 414 case '+': 415 case '?': 416 SETERROR(REG_BADRPT); 417 break; 418 case '.': 419 if (p->g->cflags®_NEWLINE) 420 nonnewline(p); 421 else 422 EMIT(OANY, 0); 423 break; 424 case '[': 425 p_bracket(p); 426 break; 427 case '\\': 428 (void)REQUIRE(MORE(), REG_EESCAPE); 429 c = GETNEXT(); 430 ordinary(p, c); 431 break; 432 case '{': /* okay as ordinary except if digit follows */ 433 (void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT); 434 /* FALLTHROUGH */ 435 default: 436 ordinary(p, c); 437 break; 438 } 439 440 if (!MORE()) 441 return; 442 c = PEEK(); 443 /* we call { a repetition if followed by a digit */ 444 if (!( c == '*' || c == '+' || c == '?' || 445 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) )) 446 return; /* no repetition, we're done */ 447 NEXT(); 448 449 (void)REQUIRE(!wascaret, REG_BADRPT); 450 switch (c) { 451 case '*': /* implemented as +? */ 452 /* this case does not require the (y|) trick, noKLUDGE */ 453 INSERT(OPLUS_, pos); 454 ASTERN(O_PLUS, pos); 455 INSERT(OQUEST_, pos); 456 ASTERN(O_QUEST, pos); 457 break; 458 case '+': 459 INSERT(OPLUS_, pos); 460 ASTERN(O_PLUS, pos); 461 break; 462 case '?': 463 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 464 INSERT(OCH_, pos); /* offset slightly wrong */ 465 ASTERN(OOR1, pos); /* this one's right */ 466 AHEAD(pos); /* fix the OCH_ */ 467 EMIT(OOR2, 0); /* offset very wrong... */ 468 AHEAD(THERE()); /* ...so fix it */ 469 ASTERN(O_CH, THERETHERE()); 470 break; 471 case '{': 472 count = p_count(p); 473 if (EAT(',')) { 474 if (ISDIGIT((UCHAR_T)PEEK())) { 475 count2 = p_count(p); 476 (void)REQUIRE(count <= count2, REG_BADBR); 477 } else /* single number with comma */ 478 count2 = INFINITY; 479 } else /* just a single number */ 480 count2 = count; 481 repeat(p, pos, count, count2, 0); 482 if (!EAT('}')) { /* error heuristics */ 483 while (MORE() && PEEK() != '}') 484 NEXT(); 485 (void)REQUIRE(MORE(), REG_EBRACE); 486 SETERROR(REG_BADBR); 487 } 488 break; 489 } 490 491 if (!MORE()) 492 return; 493 c = PEEK(); 494 if (!( c == '*' || c == '+' || c == '?' || 495 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) ) 496 return; 497 SETERROR(REG_BADRPT); 498} 499 500/* 501 - p_str - string (no metacharacters) "parser" 502 == static void p_str(struct parse *p); 503 */ 504static void 505p_str(struct parse *p) 506{ 507 (void)REQUIRE(MORE(), REG_EMPTY); 508 while (MORE()) 509 ordinary(p, GETNEXT()); 510} 511 512/* 513 - p_bre - BRE parser top level, anchoring and concatenation 514 == static void p_bre(struct parse *p, int end1, \ 515 == int end2, size_t reclimit); 516 * Giving end1 as OUT essentially eliminates the end1/end2 check. 517 * 518 * This implementation is a bit of a kludge, in that a trailing $ is first 519 * taken as an ordinary character and then revised to be an anchor. The 520 * only undesirable side effect is that '$' gets included as a character 521 * category in such cases. This is fairly harmless; not worth fixing. 522 * The amount of lookahead needed to avoid this kludge is excessive. 523 */ 524static void 525p_bre(struct parse *p, int end1, int end2, size_t reclimit) 526 527 /* first terminating character */ 528 /* second terminating character */ 529{ 530 sopno start; 531 int first = 1; /* first subexpression? */ 532 int wasdollar = 0; 533 534 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 535 p->error = REG_ESPACE; 536 return; 537 } 538 539 start = HERE(); 540 541 if (EAT('^')) { 542 EMIT(OBOL, 0); 543 p->g->iflags |= USEBOL; 544 p->g->nbol++; 545 } 546 while (MORE() && !SEETWO(end1, end2)) { 547 wasdollar = p_simp_re(p, first, reclimit); 548 first = 0; 549 } 550 if (wasdollar) { /* oops, that was a trailing anchor */ 551 DROP(1); 552 EMIT(OEOL, 0); 553 p->g->iflags |= USEEOL; 554 p->g->neol++; 555 } 556 557 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 558} 559 560/* 561 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 562 == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 563 */ 564static int /* was the simple RE an unbackslashed $? */ 565p_simp_re(struct parse *p, int starordinary, size_t reclimit) 566 567 /* is a leading * an ordinary character? */ 568{ 569 int c; 570 int count; 571 int count2; 572 sopno pos; 573 int i; 574 sopno subno; 575 int backsl; 576 577 pos = HERE(); /* repetion op, if any, covers from here */ 578 579 assert(MORE()); /* caller should have ensured this */ 580 c = GETNEXT(); 581 backsl = c == '\\'; 582 if (backsl) { 583 (void)REQUIRE(MORE(), REG_EESCAPE); 584 c = (unsigned char)GETNEXT(); 585 switch (c) { 586 case '{': 587 SETERROR(REG_BADRPT); 588 break; 589 case '(': 590 p->g->nsub++; 591 subno = p->g->nsub; 592 if (subno < NPAREN) 593 p->pbegin[subno] = HERE(); 594 EMIT(OLPAREN, subno); 595 /* the MORE here is an error heuristic */ 596 if (MORE() && !SEETWO('\\', ')')) 597 p_bre(p, '\\', ')', reclimit); 598 if (subno < NPAREN) { 599 p->pend[subno] = HERE(); 600 assert(p->pend[subno] != 0); 601 } 602 EMIT(ORPAREN, subno); 603 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 604 break; 605 case ')': /* should not get here -- must be user */ 606 case '}': 607 SETERROR(REG_EPAREN); 608 break; 609 case '1': 610 case '2': 611 case '3': 612 case '4': 613 case '5': 614 case '6': 615 case '7': 616 case '8': 617 case '9': 618 i = c - '0'; 619 assert(i < NPAREN); 620 if (p->pend[i] != 0) { 621 assert(i <= p->g->nsub); 622 EMIT(OBACK_, i); 623 assert(p->pbegin[i] != 0); 624 assert(p->strip[p->pbegin[i]] == OLPAREN); 625 assert(p->strip[p->pend[i]] == ORPAREN); 626 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 627 EMIT(O_BACK, i); 628 } else 629 SETERROR(REG_ESUBREG); 630 p->g->backrefs = 1; 631 break; 632 default: 633 ordinary(p, c); 634 break; 635 } 636 } else { 637 switch (c) { 638 case '.': 639 if (p->g->cflags®_NEWLINE) 640 nonnewline(p); 641 else 642 EMIT(OANY, 0); 643 break; 644 case '[': 645 p_bracket(p); 646 break; 647 case '*': 648 (void)REQUIRE(starordinary, REG_BADRPT); 649 /* FALLTHROUGH */ 650 default: 651 ordinary(p, c); 652 break; 653 } 654 } 655 656 if (EAT('*')) { /* implemented as +? */ 657 /* this case does not require the (y|) trick, noKLUDGE */ 658 INSERT(OPLUS_, pos); 659 ASTERN(O_PLUS, pos); 660 INSERT(OQUEST_, pos); 661 ASTERN(O_QUEST, pos); 662 } else if (EATTWO('\\', '{')) { 663 count = p_count(p); 664 if (EAT(',')) { 665 if (MORE() && ISDIGIT((UCHAR_T)PEEK())) { 666 count2 = p_count(p); 667 (void)REQUIRE(count <= count2, REG_BADBR); 668 } else /* single number with comma */ 669 count2 = INFINITY; 670 } else /* just a single number */ 671 count2 = count; 672 repeat(p, pos, count, count2, reclimit); 673 if (!EATTWO('\\', '}')) { /* error heuristics */ 674 while (MORE() && !SEETWO('\\', '}')) 675 NEXT(); 676 (void)REQUIRE(MORE(), REG_EBRACE); 677 SETERROR(REG_BADBR); 678 } 679 } else if (!backsl && c == (unsigned char)'$') /* $ (but not \$) ends it */ 680 return(1); 681 682 return(0); 683} 684 685/* 686 - p_count - parse a repetition count 687 == static int p_count(struct parse *p); 688 */ 689static int /* the value */ 690p_count(struct parse *p) 691{ 692 int count = 0; 693 int ndigits = 0; 694 695 while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) { 696 count = count*10 + (GETNEXT() - '0'); 697 ndigits++; 698 } 699 700 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 701 return(count); 702} 703 704/* 705 - p_bracket - parse a bracketed character list 706 == static void p_bracket(struct parse *p); 707 * 708 * Note a significant property of this code: if the allocset() did SETERROR, 709 * no set operations are done. 710 */ 711static void 712p_bracket(struct parse *p) 713{ 714 cset *cs; 715 int invert = 0; 716 static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' }; 717 static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' }; 718 719 cs = allocset(p); 720 if (cs == NULL) 721 return; 722 723 /* Dept of Truly Sickening Special-Case Kludges */ 724 if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) { 725 EMIT(OBOW, 0); 726 NEXTn(6); 727 return; 728 } 729 if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) { 730 EMIT(OEOW, 0); 731 NEXTn(6); 732 return; 733 } 734 735 if (EAT('^')) 736 invert++; /* make note to invert set at end */ 737 if (EAT(']')) 738 CHadd(cs, ']'); 739 else if (EAT('-')) 740 CHadd(cs, '-'); 741 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 742 p_b_term(p, cs); 743 if (EAT('-')) 744 CHadd(cs, '-'); 745 (void)MUSTEAT(']', REG_EBRACK); 746 747 if (p->error != 0) /* don't mess things up further */ 748 return; 749 750 if (p->g->cflags®_ICASE) { 751 int i; 752 int ci; 753 754 for (i = p->g->csetsize - 1; i >= 0; i--) 755 if (CHIN(cs, i) && ISALPHA2(i)) { 756 ci = othercase(i); 757 if (ci != i) 758 CHadd(cs, ci); 759 } 760 if (cs->multis != NULL) 761 mccase(p, cs); 762 } 763 if (invert) { 764 int i; 765 766 for (i = p->g->csetsize - 1; i >= 0; i--) 767 if (CHIN(cs, i)) 768 CHsub(cs, i); 769 else 770 CHadd(cs, i); 771 if (p->g->cflags®_NEWLINE) 772 CHsub(cs, '\n'); 773 if (cs->multis != NULL) 774 mcinvert(p, cs); 775 } 776 777 assert(cs->multis == NULL); /* xxx */ 778 779 if (nch(p, cs) == 1) { /* optimize singleton sets */ 780 ordinary(p, firstch(p, cs)); 781 freeset(p, cs); 782 } else 783 EMIT(OANYOF, freezeset(p, cs)); 784} 785 786/* 787 - p_b_term - parse one term of a bracketed character list 788 == static void p_b_term(struct parse *p, cset *cs); 789 */ 790static void 791p_b_term(struct parse *p, cset *cs) 792{ 793 char c; 794 char start, finish; 795 int i; 796 797 /* classify what we've got */ 798 switch ((MORE()) ? PEEK() : '\0') { 799 case '[': 800 c = (MORE2()) ? PEEK2() : '\0'; 801 break; 802 case '-': 803 SETERROR(REG_ERANGE); 804 return; /* NOTE RETURN */ 805 break; 806 default: 807 c = '\0'; 808 break; 809 } 810 811 switch (c) { 812 case ':': /* character class */ 813 NEXT2(); 814 (void)REQUIRE(MORE(), REG_EBRACK); 815 c = PEEK(); 816 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 817 p_b_cclass(p, cs); 818 (void)REQUIRE(MORE(), REG_EBRACK); 819 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 820 break; 821 case '=': /* equivalence class */ 822 NEXT2(); 823 (void)REQUIRE(MORE(), REG_EBRACK); 824 c = PEEK(); 825 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 826 p_b_eclass(p, cs); 827 (void)REQUIRE(MORE(), REG_EBRACK); 828 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 829 break; 830 default: /* symbol, ordinary character, or range */ 831/* xxx revision needed for multichar stuff */ 832 start = p_b_symbol(p); 833 if (SEE('-') && MORE2() && PEEK2() != ']') { 834 /* range */ 835 NEXT(); 836 if (EAT('-')) 837 finish = '-'; 838 else 839 finish = p_b_symbol(p); 840 } else 841 finish = start; 842/* xxx what about signed chars here... */ 843 (void)REQUIRE(start <= finish, REG_ERANGE); 844 for (i = start; i <= finish; i++) 845 CHadd(cs, i); 846 break; 847 } 848} 849 850/* 851 - p_b_cclass - parse a character-class name and deal with it 852 == static void p_b_cclass(struct parse *p, cset *cs); 853 */ 854static void 855p_b_cclass(struct parse *p, cset *cs) 856{ 857 RCHAR_T *sp = p->next; 858 struct cclass *cp; 859 size_t len; 860 const char *u; 861 char c; 862 863 while (MORE() && ISALPHA2(PEEK())) 864 NEXT(); 865 len = p->next - sp; 866 for (cp = cclasses; cp->name != NULL; cp++) 867 if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len)) 868 break; 869 if (cp->name == NULL) { 870 /* oops, didn't find it */ 871 SETERROR(REG_ECTYPE); 872 return; 873 } 874 875 u = cp->chars; 876 while ((c = *u++) != '\0') 877 CHadd(cs, c); 878 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 879 MCadd(p, cs, u); 880} 881 882/* 883 - p_b_eclass - parse an equivalence-class name and deal with it 884 == static void p_b_eclass(struct parse *p, cset *cs); 885 * 886 * This implementation is incomplete. xxx 887 */ 888static void 889p_b_eclass(struct parse *p, cset *cs) 890{ 891 char c; 892 893 c = p_b_coll_elem(p, '='); 894 CHadd(cs, c); 895} 896 897/* 898 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 899 == static char p_b_symbol(struct parse *p); 900 */ 901static char /* value of symbol */ 902p_b_symbol(struct parse *p) 903{ 904 char value; 905 906 (void)REQUIRE(MORE(), REG_EBRACK); 907 if (!EATTWO('[', '.')) 908 return(GETNEXT()); 909 910 /* collating symbol */ 911 value = p_b_coll_elem(p, '.'); 912 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 913 return(value); 914} 915 916/* 917 - p_b_coll_elem - parse a collating-element name and look it up 918 == static char p_b_coll_elem(struct parse *p, int endc); 919 */ 920static char /* value of collating element */ 921p_b_coll_elem(struct parse *p, int endc) 922 923 /* name ended by endc,']' */ 924{ 925 RCHAR_T *sp = p->next; 926 struct cname *cp; 927 size_t len; 928 929 while (MORE() && !SEETWO(endc, ']')) 930 NEXT(); 931 if (!MORE()) { 932 SETERROR(REG_EBRACK); 933 return(0); 934 } 935 len = p->next - sp; 936 for (cp = cnames; cp->name != NULL; cp++) 937 if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len)) 938 return(cp->code); /* known name */ 939 if (len == 1) 940 return(*sp); /* single character */ 941 SETERROR(REG_ECOLLATE); /* neither */ 942 return(0); 943} 944 945/* 946 - othercase - return the case counterpart of an alphabetic 947 == static char othercase(int ch); 948 */ 949static RCHAR_T /* if no counterpart, return ch */ 950othercase(int ch) 951{ 952 assert(ISALPHA2(ch)); 953 if (ISUPPER(ch)) 954 return(TOLOWER(ch)); 955 else if (ISLOWER(ch)) 956 return(TOUPPER(ch)); 957 else /* peculiar, but could happen */ 958 return(ch); 959} 960 961/* 962 - bothcases - emit a dualcase version of a two-case character 963 == static void bothcases(struct parse *p, int ch); 964 * 965 * Boy, is this implementation ever a kludge... 966 */ 967static void 968bothcases(struct parse *p, int ch) 969{ 970 RCHAR_T *oldnext = p->next; 971 RCHAR_T *oldend = p->end; 972 RCHAR_T bracket[3]; 973 974 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 975 p->next = bracket; 976 p->end = bracket+2; 977 bracket[0] = ch; 978 bracket[1] = ']'; 979 bracket[2] = '\0'; 980 p_bracket(p); 981 assert(p->next == bracket+2); 982 p->next = oldnext; 983 p->end = oldend; 984} 985 986/* 987 - ordinary - emit an ordinary character 988 == static void ordinary(struct parse *p, int ch); 989 */ 990static void 991ordinary(struct parse *p, int ch) 992{ 993/* 994 cat_t *cap = p->g->categories; 995*/ 996 997 if ((p->g->cflags®_ICASE) && ISALPHA2(ch) && othercase(ch) != ch) 998 bothcases(p, ch); 999 else { 1000 EMIT(OCHAR, (UCHAR_T)ch); 1001/* 1002 if (cap[ch] == 0) 1003 cap[ch] = p->g->ncategories++; 1004*/ 1005 } 1006} 1007 1008/* 1009 - nonnewline - emit REG_NEWLINE version of OANY 1010 == static void nonnewline(struct parse *p); 1011 * 1012 * Boy, is this implementation ever a kludge... 1013 */ 1014static void 1015nonnewline(struct parse *p) 1016{ 1017 RCHAR_T *oldnext = p->next; 1018 RCHAR_T *oldend = p->end; 1019 RCHAR_T bracket[4]; 1020 1021 p->next = bracket; 1022 p->end = bracket+3; 1023 bracket[0] = '^'; 1024 bracket[1] = '\n'; 1025 bracket[2] = ']'; 1026 bracket[3] = '\0'; 1027 p_bracket(p); 1028 assert(p->next == bracket+3); 1029 p->next = oldnext; 1030 p->end = oldend; 1031} 1032 1033/* 1034 - repeat - generate code for a bounded repetition, recursively if needed 1035 == static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit); 1036 */ 1037static void 1038repeat(struct parse *p, sopno start, int from, int to, size_t reclimit) 1039 1040 /* operand from here to end of strip */ 1041 /* repeated from this number */ 1042 /* to this number of times (maybe INFINITY) */ 1043{ 1044 sopno finish; 1045# define N 2 1046# define INF 3 1047# define REP(f, t) ((f)*8 + (t)) 1048# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1049 sopno copy; 1050 1051 if (reclimit++ > RECLIMIT) 1052 p->error = REG_ESPACE; 1053 if (p->error) 1054 return; 1055 1056 finish = HERE(); 1057 1058 assert(from <= to); 1059 1060 switch (REP(MAP(from), MAP(to))) { 1061 case REP(0, 0): /* must be user doing this */ 1062 DROP(finish-start); /* drop the operand */ 1063 break; 1064 case REP(0, 1): /* as x{1,1}? */ 1065 case REP(0, N): /* as x{1,n}? */ 1066 case REP(0, INF): /* as x{1,}? */ 1067 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1068 INSERT(OCH_, start); /* offset is wrong... */ 1069 repeat(p, start+1, 1, to, reclimit); 1070 ASTERN(OOR1, start); 1071 AHEAD(start); /* ... fix it */ 1072 EMIT(OOR2, 0); 1073 AHEAD(THERE()); 1074 ASTERN(O_CH, THERETHERE()); 1075 break; 1076 case REP(1, 1): /* trivial case */ 1077 /* done */ 1078 break; 1079 case REP(1, N): /* as x?x{1,n-1} */ 1080 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1081 INSERT(OCH_, start); 1082 ASTERN(OOR1, start); 1083 AHEAD(start); 1084 EMIT(OOR2, 0); /* offset very wrong... */ 1085 AHEAD(THERE()); /* ...so fix it */ 1086 ASTERN(O_CH, THERETHERE()); 1087 copy = dupl(p, start+1, finish+1); 1088 assert(copy == finish+4); 1089 repeat(p, copy, 1, to-1, reclimit); 1090 break; 1091 case REP(1, INF): /* as x+ */ 1092 INSERT(OPLUS_, start); 1093 ASTERN(O_PLUS, start); 1094 break; 1095 case REP(N, N): /* as xx{m-1,n-1} */ 1096 copy = dupl(p, start, finish); 1097 repeat(p, copy, from-1, to-1, reclimit); 1098 break; 1099 case REP(N, INF): /* as xx{n-1,INF} */ 1100 copy = dupl(p, start, finish); 1101 repeat(p, copy, from-1, to, reclimit); 1102 break; 1103 default: /* "can't happen" */ 1104 SETERROR(REG_ASSERT); /* just in case */ 1105 break; 1106 } 1107} 1108 1109/* 1110 - seterr - set an error condition 1111 == static int seterr(struct parse *p, int e); 1112 */ 1113static int /* useless but makes type checking happy */ 1114seterr(struct parse *p, int e) 1115{ 1116 if (p->error == 0) /* keep earliest error condition */ 1117 p->error = e; 1118 p->next = nuls; /* try to bring things to a halt */ 1119 p->end = nuls; 1120 return(0); /* make the return value well-defined */ 1121} 1122 1123/* 1124 - allocset - allocate a set of characters for [] 1125 == static cset *allocset(struct parse *p); 1126 */ 1127static cset * 1128allocset(struct parse *p) 1129{ 1130 int no = p->g->ncsets++; 1131 size_t nc; 1132 size_t nbytes; 1133 cset *cs; 1134 size_t css = (size_t)p->g->csetsize; 1135 int i; 1136 1137 if (no >= p->ncsalloc) { /* need another column of space */ 1138 p->ncsalloc += CHAR_BIT; 1139 nc = p->ncsalloc; 1140 assert(nc % CHAR_BIT == 0); 1141 nbytes = nc / CHAR_BIT * css; 1142 if (MEMSIZE(p) > MEMLIMIT) 1143 goto oomem; 1144 if (p->g->sets == NULL) 1145 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1146 else 1147 p->g->sets = (cset *)realloc((char *)p->g->sets, 1148 nc * sizeof(cset)); 1149 if (p->g->setbits == NULL) 1150 p->g->setbits = (uch *)malloc(nbytes); 1151 else { 1152 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1153 nbytes); 1154 /* xxx this isn't right if setbits is now NULL */ 1155 for (i = 0; i < no; i++) 1156 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1157 } 1158 if (p->g->sets != NULL && p->g->setbits != NULL) 1159 (void) memset((char *)p->g->setbits + (nbytes - css), 1160 0, css); 1161 else { 1162oomem: 1163 no = 0; 1164 SETERROR(REG_ESPACE); 1165 /* caller's responsibility not to do set ops */ 1166 return NULL; 1167 } 1168 } 1169 1170 cs = &p->g->sets[no]; 1171 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1172 cs->mask = 1 << ((no) % CHAR_BIT); 1173 cs->hash = 0; 1174 cs->smultis = 0; 1175 cs->multis = NULL; 1176 1177 return(cs); 1178} 1179 1180/* 1181 - freeset - free a now-unused set 1182 == static void freeset(struct parse *p, cset *cs); 1183 */ 1184static void 1185freeset(struct parse *p, cset *cs) 1186{ 1187 size_t i; 1188 cset *top = &p->g->sets[p->g->ncsets]; 1189 size_t css = (size_t)p->g->csetsize; 1190 1191 for (i = 0; i < css; i++) 1192 CHsub(cs, i); 1193 if (cs == top-1) /* recover only the easy case */ 1194 p->g->ncsets--; 1195} 1196 1197/* 1198 - freezeset - final processing on a set of characters 1199 == static int freezeset(struct parse *p, cset *cs); 1200 * 1201 * The main task here is merging identical sets. This is usually a waste 1202 * of time (although the hash code minimizes the overhead), but can win 1203 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1204 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1205 * the same value! 1206 */ 1207static int /* set number */ 1208freezeset(struct parse *p, cset *cs) 1209{ 1210 uch h = cs->hash; 1211 size_t i; 1212 cset *top = &p->g->sets[p->g->ncsets]; 1213 cset *cs2; 1214 size_t css = (size_t)p->g->csetsize; 1215 1216 /* look for an earlier one which is the same */ 1217 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1218 if (cs2->hash == h && cs2 != cs) { 1219 /* maybe */ 1220 for (i = 0; i < css; i++) 1221 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1222 break; /* no */ 1223 if (i == css) 1224 break; /* yes */ 1225 } 1226 1227 if (cs2 < top) { /* found one */ 1228 freeset(p, cs); 1229 cs = cs2; 1230 } 1231 1232 return((int)(cs - p->g->sets)); 1233} 1234 1235/* 1236 - firstch - return first character in a set (which must have at least one) 1237 == static int firstch(struct parse *p, cset *cs); 1238 */ 1239static int /* character; there is no "none" value */ 1240firstch(struct parse *p, cset *cs) 1241{ 1242 size_t i; 1243 size_t css = (size_t)p->g->csetsize; 1244 1245 for (i = 0; i < css; i++) 1246 if (CHIN(cs, i)) 1247 return((char)i); 1248 assert(never); 1249 return(0); /* arbitrary */ 1250} 1251 1252/* 1253 - nch - number of characters in a set 1254 == static int nch(struct parse *p, cset *cs); 1255 */ 1256static int 1257nch(struct parse *p, cset *cs) 1258{ 1259 size_t i; 1260 size_t css = (size_t)p->g->csetsize; 1261 int n = 0; 1262 1263 for (i = 0; i < css; i++) 1264 if (CHIN(cs, i)) 1265 n++; 1266 return(n); 1267} 1268 1269/* 1270 - mcadd - add a collating element to a cset 1271 == static void mcadd(struct parse *p, cset *cs, \ 1272 == char *cp); 1273 */ 1274static void 1275mcadd(struct parse *p, cset *cs, const char *cp) 1276{ 1277 size_t oldend = cs->smultis; 1278 1279 cs->smultis += strlen(cp) + 1; 1280 if (cs->multis == NULL) 1281 cs->multis = malloc(cs->smultis); 1282 else 1283 cs->multis = realloc(cs->multis, cs->smultis); 1284 if (cs->multis == NULL) { 1285 SETERROR(REG_ESPACE); 1286 return; 1287 } 1288 1289 (void) strcpy(cs->multis + oldend - 1, cp); 1290 cs->multis[cs->smultis - 1] = '\0'; 1291} 1292 1293#ifdef notdef 1294/* 1295 - mcsub - subtract a collating element from a cset 1296 == static void mcsub(cset *cs, char *cp); 1297 */ 1298static void 1299mcsub(cset *cs, char *cp) 1300{ 1301 char *fp = mcfind(cs, cp); 1302 size_t len = strlen(fp); 1303 1304 assert(fp != NULL); 1305 (void) memmove(fp, fp + len + 1, 1306 cs->smultis - (fp + len + 1 - cs->multis)); 1307 cs->smultis -= len; 1308 1309 if (cs->smultis == 0) { 1310 free(cs->multis); 1311 cs->multis = NULL; 1312 return; 1313 } 1314 1315 cs->multis = realloc(cs->multis, cs->smultis); 1316 assert(cs->multis != NULL); 1317} 1318 1319/* 1320 - mcin - is a collating element in a cset? 1321 == static int mcin(cset *cs, char *cp); 1322 */ 1323static int 1324mcin(cset *cs, char *cp) 1325{ 1326 return(mcfind(cs, cp) != NULL); 1327} 1328 1329/* 1330 - mcfind - find a collating element in a cset 1331 == static char *mcfind(cset *cs, char *cp); 1332 */ 1333static char * 1334mcfind(cset *cs, char *cp) 1335{ 1336 char *p; 1337 1338 if (cs->multis == NULL) 1339 return(NULL); 1340 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1341 if (strcmp(cp, p) == 0) 1342 return(p); 1343 return(NULL); 1344} 1345#endif 1346 1347/* 1348 - mcinvert - invert the list of collating elements in a cset 1349 == static void mcinvert(struct parse *p, cset *cs); 1350 * 1351 * This would have to know the set of possibilities. Implementation 1352 * is deferred. 1353 */ 1354static void 1355mcinvert(struct parse *p, cset *cs) 1356{ 1357 assert(cs->multis == NULL); /* xxx */ 1358} 1359 1360/* 1361 - mccase - add case counterparts of the list of collating elements in a cset 1362 == static void mccase(struct parse *p, cset *cs); 1363 * 1364 * This would have to know the set of possibilities. Implementation 1365 * is deferred. 1366 */ 1367static void 1368mccase(struct parse *p, cset *cs) 1369{ 1370 assert(cs->multis == NULL); /* xxx */ 1371} 1372 1373#ifdef notdef 1374/* 1375 - isinsets - is this character in any sets? 1376 == static int isinsets(struct re_guts *g, int c); 1377 */ 1378static int /* predicate */ 1379isinsets(struct re_guts *g, int c) 1380{ 1381 uch *col; 1382 int i; 1383 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1384 unsigned uc = (unsigned char)c; 1385 1386 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1387 if (col[uc] != 0) 1388 return(1); 1389 return(0); 1390} 1391 1392/* 1393 - samesets - are these two characters in exactly the same sets? 1394 == static int samesets(struct re_guts *g, int c1, int c2); 1395 */ 1396static int /* predicate */ 1397samesets(struct re_guts *g, int c1, int c2) 1398{ 1399 uch *col; 1400 int i; 1401 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1402 unsigned uc1 = (unsigned char)c1; 1403 unsigned uc2 = (unsigned char)c2; 1404 1405 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1406 if (col[uc1] != col[uc2]) 1407 return(0); 1408 return(1); 1409} 1410#endif 1411 1412/* 1413 - categorize - sort out character categories 1414 == static void categorize(struct parse *p, struct re_guts *g); 1415 */ 1416static void 1417categorize(struct parse *p, struct re_guts *g) 1418{ 1419#ifdef notdef 1420 cat_t *cats = g->categories; 1421 int c; 1422 int c2; 1423 cat_t cat; 1424 1425 /* avoid making error situations worse */ 1426 if (p->error != 0) 1427 return; 1428 1429 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1430 if (cats[c] == 0 && isinsets(g, c)) { 1431 cat = g->ncategories++; 1432 cats[c] = cat; 1433 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1434 if (cats[c2] == 0 && samesets(g, c, c2)) 1435 cats[c2] = cat; 1436 } 1437#endif 1438} 1439 1440/* 1441 - dupl - emit a duplicate of a bunch of sops 1442 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1443 */ 1444static sopno /* start of duplicate */ 1445dupl(struct parse *p, sopno start, sopno finish) 1446 1447 /* from here */ 1448 /* to this less one */ 1449{ 1450 sopno ret = HERE(); 1451 sopno len = finish - start; 1452 1453 assert(finish >= start); 1454 if (len == 0) 1455 return(ret); 1456 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1457 return ret; 1458 assert(p->ssize >= p->slen + len); 1459 (void) memcpy((char *)(p->strip + p->slen), 1460 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1461 (void) memcpy((char *)(p->stripdata + p->slen), 1462 (char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T)); 1463 p->slen += len; 1464 return(ret); 1465} 1466 1467/* 1468 - doemit - emit a strip operator 1469 == static void doemit(struct parse *p, sop op, size_t opnd); 1470 * 1471 * It might seem better to implement this as a macro with a function as 1472 * hard-case backup, but it's just too big and messy unless there are 1473 * some changes to the data structures. Maybe later. 1474 */ 1475static void 1476doemit(struct parse *p, sop op, size_t opnd) 1477{ 1478 /* avoid making error situations worse */ 1479 if (p->error != 0) 1480 return; 1481 1482 /* deal with oversize operands ("can't happen", more or less) */ 1483 assert(opnd < 1); 1484 1485 /* deal with undersized strip */ 1486 if (p->slen >= p->ssize) 1487 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1488 return; 1489 1490 /* finally, it's all reduced to the easy case */ 1491 p->strip[p->slen] = op; 1492 p->stripdata[p->slen] = opnd; 1493 p->slen++; 1494} 1495 1496/* 1497 - doinsert - insert a sop into the strip 1498 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1499 */ 1500static void 1501doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1502{ 1503 sopno sn; 1504 sop s; 1505 RCHAR_T d; 1506 int i; 1507 1508 /* avoid making error situations worse */ 1509 if (p->error != 0) 1510 return; 1511 1512 sn = HERE(); 1513 EMIT(op, opnd); /* do checks, ensure space */ 1514 assert(HERE() == sn+1); 1515 s = p->strip[sn]; 1516 d = p->stripdata[sn]; 1517 1518 /* adjust paren pointers */ 1519 assert(pos > 0); 1520 for (i = 1; i < NPAREN; i++) { 1521 if (p->pbegin[i] >= pos) { 1522 p->pbegin[i]++; 1523 } 1524 if (p->pend[i] >= pos) { 1525 p->pend[i]++; 1526 } 1527 } 1528 1529 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1530 (HERE()-pos-1)*sizeof(sop)); 1531 memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos], 1532 (HERE()-pos-1)*sizeof(RCHAR_T)); 1533 p->strip[pos] = s; 1534 p->stripdata[pos] = d; 1535} 1536 1537/* 1538 - dofwd - complete a forward reference 1539 == static void dofwd(struct parse *p, sopno pos, sop value); 1540 */ 1541static void 1542dofwd(struct parse *p, sopno pos, sop value) 1543{ 1544 /* avoid making error situations worse */ 1545 if (p->error != 0) 1546 return; 1547 1548 assert(value < 1); 1549 p->stripdata[pos] = value; 1550} 1551 1552/* 1553 - enlarge - enlarge the strip 1554 == static int enlarge(struct parse *p, sopno size); 1555 */ 1556static int 1557enlarge(struct parse *p, sopno size) 1558{ 1559 sop *sp; 1560 RCHAR_T *dp; 1561 sopno osize; 1562 1563 if (p->ssize >= size) 1564 return 1; 1565 1566 osize = p->ssize; 1567 p->ssize = size; 1568 if (MEMSIZE(p) > MEMLIMIT) 1569 goto oomem; 1570 sp = realloc(p->strip, p->ssize * sizeof(sop)); 1571 if (sp == NULL) 1572 goto oomem; 1573 p->strip = sp; 1574 dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T)); 1575 if (dp == NULL) { 1576oomem: 1577 p->ssize = osize; 1578 SETERROR(REG_ESPACE); 1579 return 0; 1580 } 1581 p->stripdata = dp; 1582 return 1; 1583} 1584 1585/* 1586 - stripsnug - compact the strip 1587 == static void stripsnug(struct parse *p, struct re_guts *g); 1588 */ 1589static void 1590stripsnug(struct parse *p, struct re_guts *g) 1591{ 1592 g->nstates = p->slen; 1593 g->strip = (sop *)realloc((char *)p->strip, 1594 p->slen * sizeof(sop)); 1595 if (g->strip == NULL) { 1596 SETERROR(REG_ESPACE); 1597 g->strip = p->strip; 1598 } 1599 g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata, 1600 p->slen * sizeof(RCHAR_T)); 1601 if (g->stripdata == NULL) { 1602 SETERROR(REG_ESPACE); 1603 g->stripdata = p->stripdata; 1604 } 1605} 1606 1607/* 1608 - findmust - fill in must and mlen with longest mandatory literal string 1609 == static void findmust(struct parse *p, struct re_guts *g); 1610 * 1611 * This algorithm could do fancy things like analyzing the operands of | 1612 * for common subsequences. Someday. This code is simple and finds most 1613 * of the interesting cases. 1614 * 1615 * Note that must and mlen got initialized during setup. 1616 */ 1617static void 1618findmust(struct parse *p, struct re_guts *g) 1619{ 1620 sop *scans; 1621 RCHAR_T *scand; 1622 sop *starts = 0; 1623 RCHAR_T *startd = NULL; 1624 sop *newstarts = 0; 1625 RCHAR_T *newstartd = NULL; 1626 sopno newlen; 1627 sop s; 1628 RCHAR_T d; 1629 RCHAR_T *cp; 1630 sopno i; 1631 1632 /* avoid making error situations worse */ 1633 if (p->error != 0) 1634 return; 1635 1636 /* find the longest OCHAR sequence in strip */ 1637 newlen = 0; 1638 scans = g->strip + 1; 1639 scand = g->stripdata + 1; 1640 do { 1641 s = *scans++; 1642 d = *scand++; 1643 switch (s) { 1644 case OCHAR: /* sequence member */ 1645 if (newlen == 0) { /* new sequence */ 1646 newstarts = scans - 1; 1647 newstartd = scand - 1; 1648 } 1649 newlen++; 1650 break; 1651 case OPLUS_: /* things that don't break one */ 1652 case OLPAREN: 1653 case ORPAREN: 1654 break; 1655 case OQUEST_: /* things that must be skipped */ 1656 case OCH_: 1657 scans--; 1658 scand--; 1659 do { 1660 scans += d; 1661 scand += d; 1662 s = *scans; 1663 d = *scand; 1664 /* assert() interferes w debug printouts */ 1665 if (s != O_QUEST && s != O_CH && s != OOR2) { 1666 g->iflags |= BAD; 1667 return; 1668 } 1669 } while (s != O_QUEST && s != O_CH); 1670 /* fallthrough */ 1671 default: /* things that break a sequence */ 1672 if (newlen > g->mlen) { /* ends one */ 1673 starts = newstarts; 1674 startd = newstartd; 1675 g->mlen = newlen; 1676 } 1677 newlen = 0; 1678 break; 1679 } 1680 } while (s != OEND); 1681 1682 if (g->mlen == 0) /* there isn't one */ 1683 return; 1684 1685 /* turn it into a character string */ 1686 g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T)); 1687 if (g->must == NULL) { /* argh; just forget it */ 1688 g->mlen = 0; 1689 return; 1690 } 1691 cp = g->must; 1692 scans = starts; 1693 scand = startd; 1694 for (i = g->mlen; i > 0; i--) { 1695 for (;;) { 1696 s = *scans++; 1697 d = *scand++; 1698 if (s == OCHAR) 1699 break; 1700 } 1701 assert(cp < g->must + g->mlen); 1702 *cp++ = d; 1703 } 1704 assert(cp == g->must + g->mlen); 1705 *cp++ = '\0'; /* just on general principles */ 1706} 1707 1708/* 1709 - pluscount - count + nesting 1710 == static sopno pluscount(struct parse *p, struct re_guts *g); 1711 */ 1712static sopno /* nesting depth */ 1713pluscount(struct parse *p, struct re_guts *g) 1714{ 1715 sop *scan; 1716 sop s; 1717 sopno plusnest = 0; 1718 sopno maxnest = 0; 1719 1720 if (p->error != 0) 1721 return(0); /* there may not be an OEND */ 1722 1723 scan = g->strip + 1; 1724 do { 1725 s = *scan++; 1726 switch (s) { 1727 case OPLUS_: 1728 plusnest++; 1729 break; 1730 case O_PLUS: 1731 if (plusnest > maxnest) 1732 maxnest = plusnest; 1733 plusnest--; 1734 break; 1735 } 1736 } while (s != OEND); 1737 if (plusnest != 0) 1738 g->iflags |= BAD; 1739 return(maxnest); 1740} 1741