regcomp.c revision 131973
11573Srgrimes/*- 21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer. 31573Srgrimes * Copyright (c) 1992, 1993, 1994 41573Srgrimes * The Regents of the University of California. All rights reserved. 51573Srgrimes * 61573Srgrimes * This code is derived from software contributed to Berkeley by 71573Srgrimes * Henry Spencer. 81573Srgrimes * 91573Srgrimes * Redistribution and use in source and binary forms, with or without 101573Srgrimes * modification, are permitted provided that the following conditions 111573Srgrimes * are met: 121573Srgrimes * 1. Redistributions of source code must retain the above copyright 131573Srgrimes * notice, this list of conditions and the following disclaimer. 141573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 151573Srgrimes * notice, this list of conditions and the following disclaimer in the 161573Srgrimes * documentation and/or other materials provided with the distribution. 171573Srgrimes * 3. All advertising materials mentioning features or use of this software 181573Srgrimes * must display the following acknowledgement: 191573Srgrimes * This product includes software developed by the University of 201573Srgrimes * California, Berkeley and its contributors. 211573Srgrimes * 4. Neither the name of the University nor the names of its contributors 221573Srgrimes * may be used to endorse or promote products derived from this software 231573Srgrimes * without specific prior written permission. 241573Srgrimes * 251573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 261573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 271573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 281573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 291573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 301573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 311573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 321573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 331573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 341573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 351573Srgrimes * SUCH DAMAGE. 361573Srgrimes * 371573Srgrimes * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 381573Srgrimes */ 391573Srgrimes 401573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 411573Srgrimesstatic char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 421573Srgrimes#endif /* LIBC_SCCS and not lint */ 4392986Sobrien#include <sys/cdefs.h> 4492986Sobrien__FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 131973 2004-07-11 05:58:31Z tjr $"); 451573Srgrimes 461573Srgrimes#include <sys/types.h> 471573Srgrimes#include <stdio.h> 481573Srgrimes#include <string.h> 491573Srgrimes#include <ctype.h> 501573Srgrimes#include <limits.h> 511573Srgrimes#include <stdlib.h> 521573Srgrimes#include <regex.h> 531573Srgrimes 5419277Sache#include "collate.h" 5519277Sache 561573Srgrimes#include "utils.h" 571573Srgrimes#include "regex2.h" 581573Srgrimes 591573Srgrimes#include "cclass.h" 601573Srgrimes#include "cname.h" 611573Srgrimes 621573Srgrimes/* 631573Srgrimes * parse structure, passed up and down to avoid global variables and 641573Srgrimes * other clumsinesses 651573Srgrimes */ 661573Srgrimesstruct parse { 671573Srgrimes char *next; /* next character in RE */ 681573Srgrimes char *end; /* end of string (-> NUL normally) */ 691573Srgrimes int error; /* has an error been seen? */ 701573Srgrimes sop *strip; /* malloced strip */ 711573Srgrimes sopno ssize; /* malloced strip size (allocated) */ 721573Srgrimes sopno slen; /* malloced strip length (used) */ 731573Srgrimes int ncsalloc; /* number of csets allocated */ 741573Srgrimes struct re_guts *g; 751573Srgrimes# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 761573Srgrimes sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 771573Srgrimes sopno pend[NPAREN]; /* -> ) ([0] unused) */ 781573Srgrimes}; 791573Srgrimes 801573Srgrimes/* ========= begin header generated by ./mkh ========= */ 811573Srgrimes#ifdef __cplusplus 821573Srgrimesextern "C" { 831573Srgrimes#endif 841573Srgrimes 851573Srgrimes/* === regcomp.c === */ 8692905Sobrienstatic void p_ere(struct parse *p, int stop); 8792905Sobrienstatic void p_ere_exp(struct parse *p); 8892905Sobrienstatic void p_str(struct parse *p); 8992905Sobrienstatic void p_bre(struct parse *p, int end1, int end2); 9092905Sobrienstatic int p_simp_re(struct parse *p, int starordinary); 9192905Sobrienstatic int p_count(struct parse *p); 9292905Sobrienstatic void p_bracket(struct parse *p); 9392905Sobrienstatic void p_b_term(struct parse *p, cset *cs); 9492905Sobrienstatic void p_b_cclass(struct parse *p, cset *cs); 9592905Sobrienstatic void p_b_eclass(struct parse *p, cset *cs); 9692905Sobrienstatic char p_b_symbol(struct parse *p); 9792905Sobrienstatic char p_b_coll_elem(struct parse *p, int endc); 9892905Sobrienstatic char othercase(int ch); 9992905Sobrienstatic void bothcases(struct parse *p, int ch); 10092905Sobrienstatic void ordinary(struct parse *p, int ch); 10192905Sobrienstatic void nonnewline(struct parse *p); 10292905Sobrienstatic void repeat(struct parse *p, sopno start, int from, int to); 10392905Sobrienstatic int seterr(struct parse *p, int e); 10492905Sobrienstatic cset *allocset(struct parse *p); 10592905Sobrienstatic void freeset(struct parse *p, cset *cs); 10692905Sobrienstatic int freezeset(struct parse *p, cset *cs); 10792905Sobrienstatic int firstch(struct parse *p, cset *cs); 10892905Sobrienstatic int nch(struct parse *p, cset *cs); 10992905Sobrienstatic sopno dupl(struct parse *p, sopno start, sopno finish); 11092905Sobrienstatic void doemit(struct parse *p, sop op, size_t opnd); 11192905Sobrienstatic void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 11292905Sobrienstatic void dofwd(struct parse *p, sopno pos, sop value); 11392905Sobrienstatic void enlarge(struct parse *p, sopno size); 11492905Sobrienstatic void stripsnug(struct parse *p, struct re_guts *g); 11592905Sobrienstatic void findmust(struct parse *p, struct re_guts *g); 116131973Stjrstatic int altoffset(sop *scan, int offset); 11792905Sobrienstatic void computejumps(struct parse *p, struct re_guts *g); 11892905Sobrienstatic void computematchjumps(struct parse *p, struct re_guts *g); 11992905Sobrienstatic sopno pluscount(struct parse *p, struct re_guts *g); 1201573Srgrimes 1211573Srgrimes#ifdef __cplusplus 1221573Srgrimes} 1231573Srgrimes#endif 1241573Srgrimes/* ========= end header generated by ./mkh ========= */ 1251573Srgrimes 1261573Srgrimesstatic char nuls[10]; /* place to point scanner in event of error */ 1271573Srgrimes 1281573Srgrimes/* 1291573Srgrimes * macros for use with parse structure 1301573Srgrimes * BEWARE: these know that the parse structure is named `p' !!! 1311573Srgrimes */ 1321573Srgrimes#define PEEK() (*p->next) 1331573Srgrimes#define PEEK2() (*(p->next+1)) 1341573Srgrimes#define MORE() (p->next < p->end) 1351573Srgrimes#define MORE2() (p->next+1 < p->end) 1361573Srgrimes#define SEE(c) (MORE() && PEEK() == (c)) 1371573Srgrimes#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 1381573Srgrimes#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 1391573Srgrimes#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 1401573Srgrimes#define NEXT() (p->next++) 1411573Srgrimes#define NEXT2() (p->next += 2) 1421573Srgrimes#define NEXTn(n) (p->next += (n)) 1431573Srgrimes#define GETNEXT() (*p->next++) 1441573Srgrimes#define SETERROR(e) seterr(p, (e)) 1451573Srgrimes#define REQUIRE(co, e) ((co) || SETERROR(e)) 1461573Srgrimes#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 1471573Srgrimes#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 1481573Srgrimes#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 1491573Srgrimes#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 1501573Srgrimes#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 1511573Srgrimes#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 1521573Srgrimes#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 1531573Srgrimes#define HERE() (p->slen) 1541573Srgrimes#define THERE() (p->slen - 1) 1551573Srgrimes#define THERETHERE() (p->slen - 2) 1561573Srgrimes#define DROP(n) (p->slen -= (n)) 1571573Srgrimes 1581573Srgrimes#ifndef NDEBUG 1591573Srgrimesstatic int never = 0; /* for use in asserts; shuts lint up */ 1601573Srgrimes#else 1611573Srgrimes#define never 0 /* some <assert.h>s have bugs too */ 1621573Srgrimes#endif 1631573Srgrimes 16462232Sdcs/* Macro used by computejump()/computematchjump() */ 16562232Sdcs#define MIN(a,b) ((a)<(b)?(a):(b)) 16662232Sdcs 1671573Srgrimes/* 1681573Srgrimes - regcomp - interface for parser and compilation 1691573Srgrimes = extern int regcomp(regex_t *, const char *, int); 1701573Srgrimes = #define REG_BASIC 0000 1711573Srgrimes = #define REG_EXTENDED 0001 1721573Srgrimes = #define REG_ICASE 0002 1731573Srgrimes = #define REG_NOSUB 0004 1741573Srgrimes = #define REG_NEWLINE 0010 1751573Srgrimes = #define REG_NOSPEC 0020 1761573Srgrimes = #define REG_PEND 0040 1771573Srgrimes = #define REG_DUMP 0200 1781573Srgrimes */ 1791573Srgrimesint /* 0 success, otherwise REG_something */ 1801573Srgrimesregcomp(preg, pattern, cflags) 181104358Smikeregex_t * __restrict preg; 182104358Smikeconst char * __restrict pattern; 1831573Srgrimesint cflags; 1841573Srgrimes{ 1851573Srgrimes struct parse pa; 18692889Sobrien struct re_guts *g; 18792889Sobrien struct parse *p = &pa; 18892889Sobrien int i; 18992889Sobrien size_t len; 1901573Srgrimes#ifdef REDEBUG 1911573Srgrimes# define GOODFLAGS(f) (f) 1921573Srgrimes#else 1931573Srgrimes# define GOODFLAGS(f) ((f)&~REG_DUMP) 1941573Srgrimes#endif 1951573Srgrimes 1961573Srgrimes cflags = GOODFLAGS(cflags); 1971573Srgrimes if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 1981573Srgrimes return(REG_INVARG); 1991573Srgrimes 2001573Srgrimes if (cflags®_PEND) { 2011573Srgrimes if (preg->re_endp < pattern) 2021573Srgrimes return(REG_INVARG); 2031573Srgrimes len = preg->re_endp - pattern; 2041573Srgrimes } else 2051573Srgrimes len = strlen((char *)pattern); 2061573Srgrimes 2071573Srgrimes /* do the mallocs early so failure handling is easy */ 208131973Stjr g = (struct re_guts *)malloc(sizeof(struct re_guts)); 2091573Srgrimes if (g == NULL) 2101573Srgrimes return(REG_ESPACE); 2111573Srgrimes p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 2121573Srgrimes p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 2131573Srgrimes p->slen = 0; 2141573Srgrimes if (p->strip == NULL) { 2151573Srgrimes free((char *)g); 2161573Srgrimes return(REG_ESPACE); 2171573Srgrimes } 2181573Srgrimes 2191573Srgrimes /* set things up */ 2201573Srgrimes p->g = g; 2211573Srgrimes p->next = (char *)pattern; /* convenience; we do not modify it */ 2221573Srgrimes p->end = p->next + len; 2231573Srgrimes p->error = 0; 2241573Srgrimes p->ncsalloc = 0; 2251573Srgrimes for (i = 0; i < NPAREN; i++) { 2261573Srgrimes p->pbegin[i] = 0; 2271573Srgrimes p->pend[i] = 0; 2281573Srgrimes } 2291573Srgrimes g->csetsize = NC; 2301573Srgrimes g->sets = NULL; 2311573Srgrimes g->setbits = NULL; 2321573Srgrimes g->ncsets = 0; 2331573Srgrimes g->cflags = cflags; 2341573Srgrimes g->iflags = 0; 2351573Srgrimes g->nbol = 0; 2361573Srgrimes g->neol = 0; 2371573Srgrimes g->must = NULL; 23862391Sdcs g->moffset = -1; 23962263Sdcs g->charjump = NULL; 24062263Sdcs g->matchjump = NULL; 2411573Srgrimes g->mlen = 0; 2421573Srgrimes g->nsub = 0; 2431573Srgrimes g->backrefs = 0; 2441573Srgrimes 2451573Srgrimes /* do it */ 2461573Srgrimes EMIT(OEND, 0); 2471573Srgrimes g->firststate = THERE(); 2481573Srgrimes if (cflags®_EXTENDED) 2491573Srgrimes p_ere(p, OUT); 2501573Srgrimes else if (cflags®_NOSPEC) 2511573Srgrimes p_str(p); 2521573Srgrimes else 2531573Srgrimes p_bre(p, OUT, OUT); 2541573Srgrimes EMIT(OEND, 0); 2551573Srgrimes g->laststate = THERE(); 2561573Srgrimes 2571573Srgrimes /* tidy up loose ends and fill things in */ 2581573Srgrimes stripsnug(p, g); 2591573Srgrimes findmust(p, g); 26062232Sdcs /* only use Boyer-Moore algorithm if the pattern is bigger 26162232Sdcs * than three characters 26262232Sdcs */ 26362232Sdcs if(g->mlen > 3) { 26462232Sdcs computejumps(p, g); 26562232Sdcs computematchjumps(p, g); 26662755Sdcs if(g->matchjump == NULL && g->charjump != NULL) { 26762232Sdcs free(g->charjump); 26862232Sdcs g->charjump = NULL; 26962232Sdcs } 27062232Sdcs } 2711573Srgrimes g->nplus = pluscount(p, g); 2721573Srgrimes g->magic = MAGIC2; 2731573Srgrimes preg->re_nsub = g->nsub; 2741573Srgrimes preg->re_g = g; 2751573Srgrimes preg->re_magic = MAGIC1; 2761573Srgrimes#ifndef REDEBUG 2771573Srgrimes /* not debugging, so can't rely on the assert() in regexec() */ 2781573Srgrimes if (g->iflags&BAD) 2791573Srgrimes SETERROR(REG_ASSERT); 2801573Srgrimes#endif 2811573Srgrimes 2821573Srgrimes /* win or lose, we're done */ 2831573Srgrimes if (p->error != 0) /* lose */ 2841573Srgrimes regfree(preg); 2851573Srgrimes return(p->error); 2861573Srgrimes} 2871573Srgrimes 2881573Srgrimes/* 2891573Srgrimes - p_ere - ERE parser top level, concatenation and alternation 29092889Sobrien == static void p_ere(struct parse *p, int stop); 2911573Srgrimes */ 2921573Srgrimesstatic void 2931573Srgrimesp_ere(p, stop) 29492889Sobrienstruct parse *p; 2951573Srgrimesint stop; /* character this ERE should end at */ 2961573Srgrimes{ 29792889Sobrien char c; 29892889Sobrien sopno prevback; 29992889Sobrien sopno prevfwd; 30092889Sobrien sopno conc; 30192889Sobrien int first = 1; /* is this the first alternative? */ 3021573Srgrimes 3031573Srgrimes for (;;) { 3041573Srgrimes /* do a bunch of concatenated expressions */ 3051573Srgrimes conc = HERE(); 3061573Srgrimes while (MORE() && (c = PEEK()) != '|' && c != stop) 3071573Srgrimes p_ere_exp(p); 30817141Sjkh (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 3091573Srgrimes 3101573Srgrimes if (!EAT('|')) 3111573Srgrimes break; /* NOTE BREAK OUT */ 3121573Srgrimes 3131573Srgrimes if (first) { 3141573Srgrimes INSERT(OCH_, conc); /* offset is wrong */ 3151573Srgrimes prevfwd = conc; 3161573Srgrimes prevback = conc; 3171573Srgrimes first = 0; 3181573Srgrimes } 3191573Srgrimes ASTERN(OOR1, prevback); 3201573Srgrimes prevback = THERE(); 3211573Srgrimes AHEAD(prevfwd); /* fix previous offset */ 3221573Srgrimes prevfwd = HERE(); 3231573Srgrimes EMIT(OOR2, 0); /* offset is very wrong */ 3241573Srgrimes } 3251573Srgrimes 3261573Srgrimes if (!first) { /* tail-end fixups */ 3271573Srgrimes AHEAD(prevfwd); 3281573Srgrimes ASTERN(O_CH, prevback); 3291573Srgrimes } 3301573Srgrimes 3311573Srgrimes assert(!MORE() || SEE(stop)); 3321573Srgrimes} 3331573Srgrimes 3341573Srgrimes/* 3351573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 33692889Sobrien == static void p_ere_exp(struct parse *p); 3371573Srgrimes */ 3381573Srgrimesstatic void 3391573Srgrimesp_ere_exp(p) 34092889Sobrienstruct parse *p; 3411573Srgrimes{ 34292889Sobrien char c; 34392889Sobrien sopno pos; 34492889Sobrien int count; 34592889Sobrien int count2; 34692889Sobrien sopno subno; 3471573Srgrimes int wascaret = 0; 3481573Srgrimes 3491573Srgrimes assert(MORE()); /* caller should have ensured this */ 3501573Srgrimes c = GETNEXT(); 3511573Srgrimes 3521573Srgrimes pos = HERE(); 3531573Srgrimes switch (c) { 3541573Srgrimes case '(': 35517141Sjkh (void)REQUIRE(MORE(), REG_EPAREN); 3561573Srgrimes p->g->nsub++; 3571573Srgrimes subno = p->g->nsub; 3581573Srgrimes if (subno < NPAREN) 3591573Srgrimes p->pbegin[subno] = HERE(); 3601573Srgrimes EMIT(OLPAREN, subno); 3611573Srgrimes if (!SEE(')')) 3621573Srgrimes p_ere(p, ')'); 3631573Srgrimes if (subno < NPAREN) { 3641573Srgrimes p->pend[subno] = HERE(); 3651573Srgrimes assert(p->pend[subno] != 0); 3661573Srgrimes } 3671573Srgrimes EMIT(ORPAREN, subno); 36817141Sjkh (void)MUSTEAT(')', REG_EPAREN); 3691573Srgrimes break; 3701573Srgrimes#ifndef POSIX_MISTAKE 3711573Srgrimes case ')': /* happens only if no current unmatched ( */ 3721573Srgrimes /* 3731573Srgrimes * You may ask, why the ifndef? Because I didn't notice 3741573Srgrimes * this until slightly too late for 1003.2, and none of the 3751573Srgrimes * other 1003.2 regular-expression reviewers noticed it at 3761573Srgrimes * all. So an unmatched ) is legal POSIX, at least until 3771573Srgrimes * we can get it fixed. 3781573Srgrimes */ 3791573Srgrimes SETERROR(REG_EPAREN); 3801573Srgrimes break; 3811573Srgrimes#endif 3821573Srgrimes case '^': 3831573Srgrimes EMIT(OBOL, 0); 3841573Srgrimes p->g->iflags |= USEBOL; 3851573Srgrimes p->g->nbol++; 3861573Srgrimes wascaret = 1; 3871573Srgrimes break; 3881573Srgrimes case '$': 3891573Srgrimes EMIT(OEOL, 0); 3901573Srgrimes p->g->iflags |= USEEOL; 3911573Srgrimes p->g->neol++; 3921573Srgrimes break; 3931573Srgrimes case '|': 3941573Srgrimes SETERROR(REG_EMPTY); 3951573Srgrimes break; 3961573Srgrimes case '*': 3971573Srgrimes case '+': 3981573Srgrimes case '?': 3991573Srgrimes SETERROR(REG_BADRPT); 4001573Srgrimes break; 4011573Srgrimes case '.': 4021573Srgrimes if (p->g->cflags®_NEWLINE) 4031573Srgrimes nonnewline(p); 4041573Srgrimes else 4051573Srgrimes EMIT(OANY, 0); 4061573Srgrimes break; 4071573Srgrimes case '[': 4081573Srgrimes p_bracket(p); 4091573Srgrimes break; 4101573Srgrimes case '\\': 41117141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 4121573Srgrimes c = GETNEXT(); 4131573Srgrimes ordinary(p, c); 4141573Srgrimes break; 4151573Srgrimes case '{': /* okay as ordinary except if digit follows */ 41649094Sache (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 4171573Srgrimes /* FALLTHROUGH */ 4181573Srgrimes default: 4191573Srgrimes ordinary(p, c); 4201573Srgrimes break; 4211573Srgrimes } 4221573Srgrimes 4231573Srgrimes if (!MORE()) 4241573Srgrimes return; 4251573Srgrimes c = PEEK(); 4261573Srgrimes /* we call { a repetition if followed by a digit */ 4271573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 42849094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 4291573Srgrimes return; /* no repetition, we're done */ 4301573Srgrimes NEXT(); 4311573Srgrimes 43217141Sjkh (void)REQUIRE(!wascaret, REG_BADRPT); 4331573Srgrimes switch (c) { 4341573Srgrimes case '*': /* implemented as +? */ 4351573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 4361573Srgrimes INSERT(OPLUS_, pos); 4371573Srgrimes ASTERN(O_PLUS, pos); 4381573Srgrimes INSERT(OQUEST_, pos); 4391573Srgrimes ASTERN(O_QUEST, pos); 4401573Srgrimes break; 4411573Srgrimes case '+': 4421573Srgrimes INSERT(OPLUS_, pos); 4431573Srgrimes ASTERN(O_PLUS, pos); 4441573Srgrimes break; 4451573Srgrimes case '?': 4461573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 4471573Srgrimes INSERT(OCH_, pos); /* offset slightly wrong */ 4481573Srgrimes ASTERN(OOR1, pos); /* this one's right */ 4491573Srgrimes AHEAD(pos); /* fix the OCH_ */ 4501573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 4511573Srgrimes AHEAD(THERE()); /* ...so fix it */ 4521573Srgrimes ASTERN(O_CH, THERETHERE()); 4531573Srgrimes break; 4541573Srgrimes case '{': 4551573Srgrimes count = p_count(p); 4561573Srgrimes if (EAT(',')) { 45749094Sache if (isdigit((uch)PEEK())) { 4581573Srgrimes count2 = p_count(p); 45917141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 4601573Srgrimes } else /* single number with comma */ 4611573Srgrimes count2 = INFINITY; 4621573Srgrimes } else /* just a single number */ 4631573Srgrimes count2 = count; 4641573Srgrimes repeat(p, pos, count, count2); 4651573Srgrimes if (!EAT('}')) { /* error heuristics */ 4661573Srgrimes while (MORE() && PEEK() != '}') 4671573Srgrimes NEXT(); 46817141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 4691573Srgrimes SETERROR(REG_BADBR); 4701573Srgrimes } 4711573Srgrimes break; 4721573Srgrimes } 4731573Srgrimes 4741573Srgrimes if (!MORE()) 4751573Srgrimes return; 4761573Srgrimes c = PEEK(); 4771573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 47849094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 4791573Srgrimes return; 4801573Srgrimes SETERROR(REG_BADRPT); 4811573Srgrimes} 4821573Srgrimes 4831573Srgrimes/* 4841573Srgrimes - p_str - string (no metacharacters) "parser" 48592889Sobrien == static void p_str(struct parse *p); 4861573Srgrimes */ 4871573Srgrimesstatic void 4881573Srgrimesp_str(p) 48992889Sobrienstruct parse *p; 4901573Srgrimes{ 49117141Sjkh (void)REQUIRE(MORE(), REG_EMPTY); 4921573Srgrimes while (MORE()) 4931573Srgrimes ordinary(p, GETNEXT()); 4941573Srgrimes} 4951573Srgrimes 4961573Srgrimes/* 4971573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation 49892889Sobrien == static void p_bre(struct parse *p, int end1, \ 49992889Sobrien == int end2); 5001573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check. 5011573Srgrimes * 5021573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first 503131973Stjr * taken as an ordinary character and then revised to be an anchor. 5041573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive. 5051573Srgrimes */ 5061573Srgrimesstatic void 5071573Srgrimesp_bre(p, end1, end2) 50892889Sobrienstruct parse *p; 50992889Sobrienint end1; /* first terminating character */ 51092889Sobrienint end2; /* second terminating character */ 5111573Srgrimes{ 51292889Sobrien sopno start = HERE(); 51392889Sobrien int first = 1; /* first subexpression? */ 51492889Sobrien int wasdollar = 0; 5151573Srgrimes 5161573Srgrimes if (EAT('^')) { 5171573Srgrimes EMIT(OBOL, 0); 5181573Srgrimes p->g->iflags |= USEBOL; 5191573Srgrimes p->g->nbol++; 5201573Srgrimes } 5211573Srgrimes while (MORE() && !SEETWO(end1, end2)) { 5221573Srgrimes wasdollar = p_simp_re(p, first); 5231573Srgrimes first = 0; 5241573Srgrimes } 5251573Srgrimes if (wasdollar) { /* oops, that was a trailing anchor */ 5261573Srgrimes DROP(1); 5271573Srgrimes EMIT(OEOL, 0); 5281573Srgrimes p->g->iflags |= USEEOL; 5291573Srgrimes p->g->neol++; 5301573Srgrimes } 5311573Srgrimes 53217141Sjkh (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 5331573Srgrimes} 5341573Srgrimes 5351573Srgrimes/* 5361573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 53792889Sobrien == static int p_simp_re(struct parse *p, int starordinary); 5381573Srgrimes */ 5391573Srgrimesstatic int /* was the simple RE an unbackslashed $? */ 5401573Srgrimesp_simp_re(p, starordinary) 54192889Sobrienstruct parse *p; 5421573Srgrimesint starordinary; /* is a leading * an ordinary character? */ 5431573Srgrimes{ 54492889Sobrien int c; 54592889Sobrien int count; 54692889Sobrien int count2; 54792889Sobrien sopno pos; 54892889Sobrien int i; 54992889Sobrien sopno subno; 5501573Srgrimes# define BACKSL (1<<CHAR_BIT) 5511573Srgrimes 5521573Srgrimes pos = HERE(); /* repetion op, if any, covers from here */ 5531573Srgrimes 5541573Srgrimes assert(MORE()); /* caller should have ensured this */ 5551573Srgrimes c = GETNEXT(); 5561573Srgrimes if (c == '\\') { 55717141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 55849094Sache c = BACKSL | GETNEXT(); 5591573Srgrimes } 5601573Srgrimes switch (c) { 5611573Srgrimes case '.': 5621573Srgrimes if (p->g->cflags®_NEWLINE) 5631573Srgrimes nonnewline(p); 5641573Srgrimes else 5651573Srgrimes EMIT(OANY, 0); 5661573Srgrimes break; 5671573Srgrimes case '[': 5681573Srgrimes p_bracket(p); 5691573Srgrimes break; 5701573Srgrimes case BACKSL|'{': 5711573Srgrimes SETERROR(REG_BADRPT); 5721573Srgrimes break; 5731573Srgrimes case BACKSL|'(': 5741573Srgrimes p->g->nsub++; 5751573Srgrimes subno = p->g->nsub; 5761573Srgrimes if (subno < NPAREN) 5771573Srgrimes p->pbegin[subno] = HERE(); 5781573Srgrimes EMIT(OLPAREN, subno); 5791573Srgrimes /* the MORE here is an error heuristic */ 5801573Srgrimes if (MORE() && !SEETWO('\\', ')')) 5811573Srgrimes p_bre(p, '\\', ')'); 5821573Srgrimes if (subno < NPAREN) { 5831573Srgrimes p->pend[subno] = HERE(); 5841573Srgrimes assert(p->pend[subno] != 0); 5851573Srgrimes } 5861573Srgrimes EMIT(ORPAREN, subno); 58717141Sjkh (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 5881573Srgrimes break; 5891573Srgrimes case BACKSL|')': /* should not get here -- must be user */ 5901573Srgrimes case BACKSL|'}': 5911573Srgrimes SETERROR(REG_EPAREN); 5921573Srgrimes break; 5931573Srgrimes case BACKSL|'1': 5941573Srgrimes case BACKSL|'2': 5951573Srgrimes case BACKSL|'3': 5961573Srgrimes case BACKSL|'4': 5971573Srgrimes case BACKSL|'5': 5981573Srgrimes case BACKSL|'6': 5991573Srgrimes case BACKSL|'7': 6001573Srgrimes case BACKSL|'8': 6011573Srgrimes case BACKSL|'9': 6021573Srgrimes i = (c&~BACKSL) - '0'; 6031573Srgrimes assert(i < NPAREN); 6041573Srgrimes if (p->pend[i] != 0) { 6051573Srgrimes assert(i <= p->g->nsub); 6061573Srgrimes EMIT(OBACK_, i); 6071573Srgrimes assert(p->pbegin[i] != 0); 6081573Srgrimes assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 6091573Srgrimes assert(OP(p->strip[p->pend[i]]) == ORPAREN); 6101573Srgrimes (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 6111573Srgrimes EMIT(O_BACK, i); 6121573Srgrimes } else 6131573Srgrimes SETERROR(REG_ESUBREG); 6141573Srgrimes p->g->backrefs = 1; 6151573Srgrimes break; 6161573Srgrimes case '*': 61717141Sjkh (void)REQUIRE(starordinary, REG_BADRPT); 6181573Srgrimes /* FALLTHROUGH */ 6191573Srgrimes default: 62049094Sache ordinary(p, (char)c); 6211573Srgrimes break; 6221573Srgrimes } 6231573Srgrimes 6241573Srgrimes if (EAT('*')) { /* implemented as +? */ 6251573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 6261573Srgrimes INSERT(OPLUS_, pos); 6271573Srgrimes ASTERN(O_PLUS, pos); 6281573Srgrimes INSERT(OQUEST_, pos); 6291573Srgrimes ASTERN(O_QUEST, pos); 6301573Srgrimes } else if (EATTWO('\\', '{')) { 6311573Srgrimes count = p_count(p); 6321573Srgrimes if (EAT(',')) { 63349094Sache if (MORE() && isdigit((uch)PEEK())) { 6341573Srgrimes count2 = p_count(p); 63517141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 6361573Srgrimes } else /* single number with comma */ 6371573Srgrimes count2 = INFINITY; 6381573Srgrimes } else /* just a single number */ 6391573Srgrimes count2 = count; 6401573Srgrimes repeat(p, pos, count, count2); 6411573Srgrimes if (!EATTWO('\\', '}')) { /* error heuristics */ 6421573Srgrimes while (MORE() && !SEETWO('\\', '}')) 6431573Srgrimes NEXT(); 64417141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 6451573Srgrimes SETERROR(REG_BADBR); 6461573Srgrimes } 64749094Sache } else if (c == '$') /* $ (but not \$) ends it */ 6481573Srgrimes return(1); 6491573Srgrimes 6501573Srgrimes return(0); 6511573Srgrimes} 6521573Srgrimes 6531573Srgrimes/* 6541573Srgrimes - p_count - parse a repetition count 65592889Sobrien == static int p_count(struct parse *p); 6561573Srgrimes */ 6571573Srgrimesstatic int /* the value */ 6581573Srgrimesp_count(p) 65992889Sobrienstruct parse *p; 6601573Srgrimes{ 66192889Sobrien int count = 0; 66292889Sobrien int ndigits = 0; 6631573Srgrimes 66449094Sache while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 6651573Srgrimes count = count*10 + (GETNEXT() - '0'); 6661573Srgrimes ndigits++; 6671573Srgrimes } 6681573Srgrimes 66917141Sjkh (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 6701573Srgrimes return(count); 6711573Srgrimes} 6721573Srgrimes 6731573Srgrimes/* 6741573Srgrimes - p_bracket - parse a bracketed character list 67592889Sobrien == static void p_bracket(struct parse *p); 6761573Srgrimes * 6771573Srgrimes * Note a significant property of this code: if the allocset() did SETERROR, 6781573Srgrimes * no set operations are done. 6791573Srgrimes */ 6801573Srgrimesstatic void 6811573Srgrimesp_bracket(p) 68292889Sobrienstruct parse *p; 6831573Srgrimes{ 68492889Sobrien cset *cs = allocset(p); 68592889Sobrien int invert = 0; 6861573Srgrimes 6871573Srgrimes /* Dept of Truly Sickening Special-Case Kludges */ 6881573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 6891573Srgrimes EMIT(OBOW, 0); 6901573Srgrimes NEXTn(6); 6911573Srgrimes return; 6921573Srgrimes } 6931573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 6941573Srgrimes EMIT(OEOW, 0); 6951573Srgrimes NEXTn(6); 6961573Srgrimes return; 6971573Srgrimes } 6981573Srgrimes 6991573Srgrimes if (EAT('^')) 7001573Srgrimes invert++; /* make note to invert set at end */ 7011573Srgrimes if (EAT(']')) 7021573Srgrimes CHadd(cs, ']'); 7031573Srgrimes else if (EAT('-')) 7041573Srgrimes CHadd(cs, '-'); 7051573Srgrimes while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 7061573Srgrimes p_b_term(p, cs); 7071573Srgrimes if (EAT('-')) 7081573Srgrimes CHadd(cs, '-'); 70917141Sjkh (void)MUSTEAT(']', REG_EBRACK); 7101573Srgrimes 7111573Srgrimes if (p->error != 0) /* don't mess things up further */ 7121573Srgrimes return; 7131573Srgrimes 7141573Srgrimes if (p->g->cflags®_ICASE) { 71592889Sobrien int i; 71692889Sobrien int ci; 7171573Srgrimes 7181573Srgrimes for (i = p->g->csetsize - 1; i >= 0; i--) 7191573Srgrimes if (CHIN(cs, i) && isalpha(i)) { 7201573Srgrimes ci = othercase(i); 7211573Srgrimes if (ci != i) 7221573Srgrimes CHadd(cs, ci); 7231573Srgrimes } 7241573Srgrimes } 7251573Srgrimes if (invert) { 72692889Sobrien int i; 7271573Srgrimes 7281573Srgrimes for (i = p->g->csetsize - 1; i >= 0; i--) 7291573Srgrimes if (CHIN(cs, i)) 7301573Srgrimes CHsub(cs, i); 7311573Srgrimes else 7321573Srgrimes CHadd(cs, i); 7331573Srgrimes if (p->g->cflags®_NEWLINE) 7341573Srgrimes CHsub(cs, '\n'); 7351573Srgrimes } 7361573Srgrimes 7371573Srgrimes if (nch(p, cs) == 1) { /* optimize singleton sets */ 7381573Srgrimes ordinary(p, firstch(p, cs)); 7391573Srgrimes freeset(p, cs); 7401573Srgrimes } else 7411573Srgrimes EMIT(OANYOF, freezeset(p, cs)); 7421573Srgrimes} 7431573Srgrimes 7441573Srgrimes/* 7451573Srgrimes - p_b_term - parse one term of a bracketed character list 74692889Sobrien == static void p_b_term(struct parse *p, cset *cs); 7471573Srgrimes */ 7481573Srgrimesstatic void 7491573Srgrimesp_b_term(p, cs) 75092889Sobrienstruct parse *p; 75192889Sobriencset *cs; 7521573Srgrimes{ 75392889Sobrien char c; 75492889Sobrien char start, finish; 75592889Sobrien int i; 7561573Srgrimes 7571573Srgrimes /* classify what we've got */ 7581573Srgrimes switch ((MORE()) ? PEEK() : '\0') { 7591573Srgrimes case '[': 7601573Srgrimes c = (MORE2()) ? PEEK2() : '\0'; 7611573Srgrimes break; 7621573Srgrimes case '-': 7631573Srgrimes SETERROR(REG_ERANGE); 7641573Srgrimes return; /* NOTE RETURN */ 7651573Srgrimes break; 7661573Srgrimes default: 7671573Srgrimes c = '\0'; 7681573Srgrimes break; 7691573Srgrimes } 7701573Srgrimes 7711573Srgrimes switch (c) { 7721573Srgrimes case ':': /* character class */ 7731573Srgrimes NEXT2(); 77417141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 7751573Srgrimes c = PEEK(); 77617141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 7771573Srgrimes p_b_cclass(p, cs); 77817141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 77917141Sjkh (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 7801573Srgrimes break; 7811573Srgrimes case '=': /* equivalence class */ 7821573Srgrimes NEXT2(); 78317141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 7841573Srgrimes c = PEEK(); 78517141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 7861573Srgrimes p_b_eclass(p, cs); 78717141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 78817141Sjkh (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 7891573Srgrimes break; 7901573Srgrimes default: /* symbol, ordinary character, or range */ 7911573Srgrimes start = p_b_symbol(p); 7921573Srgrimes if (SEE('-') && MORE2() && PEEK2() != ']') { 7931573Srgrimes /* range */ 7941573Srgrimes NEXT(); 7951573Srgrimes if (EAT('-')) 7961573Srgrimes finish = '-'; 7971573Srgrimes else 7981573Srgrimes finish = p_b_symbol(p); 7991573Srgrimes } else 8001573Srgrimes finish = start; 80117514Sache if (start == finish) 80217514Sache CHadd(cs, start); 80317514Sache else { 80424637Sache if (__collate_load_error) { 80524637Sache (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 80624637Sache for (i = (uch)start; i <= (uch)finish; i++) 80717514Sache CHadd(cs, i); 80824637Sache } else { 80924637Sache (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 81024637Sache for (i = CHAR_MIN; i <= CHAR_MAX; i++) { 81124637Sache if ( __collate_range_cmp(start, i) <= 0 81224637Sache && __collate_range_cmp(i, finish) <= 0 81324637Sache ) 81424637Sache CHadd(cs, i); 81524637Sache } 81617514Sache } 81717514Sache } 8181573Srgrimes break; 8191573Srgrimes } 8201573Srgrimes} 8211573Srgrimes 8221573Srgrimes/* 8231573Srgrimes - p_b_cclass - parse a character-class name and deal with it 82492889Sobrien == static void p_b_cclass(struct parse *p, cset *cs); 8251573Srgrimes */ 8261573Srgrimesstatic void 8271573Srgrimesp_b_cclass(p, cs) 82892889Sobrienstruct parse *p; 82992889Sobriencset *cs; 8301573Srgrimes{ 83192889Sobrien int c; 83292889Sobrien char *sp = p->next; 83392889Sobrien struct cclass *cp; 83492889Sobrien size_t len; 8351573Srgrimes 83617508Sache while (MORE() && isalpha((uch)PEEK())) 8371573Srgrimes NEXT(); 8381573Srgrimes len = p->next - sp; 8391573Srgrimes for (cp = cclasses; cp->name != NULL; cp++) 8401573Srgrimes if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 8411573Srgrimes break; 8421573Srgrimes if (cp->name == NULL) { 8431573Srgrimes /* oops, didn't find it */ 8441573Srgrimes SETERROR(REG_ECTYPE); 8451573Srgrimes return; 8461573Srgrimes } 8471573Srgrimes 84817508Sache switch (cp->fidx) { 84917508Sache case CALNUM: 85017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 85117508Sache if (isalnum((uch)c)) 85217508Sache CHadd(cs, c); 85317508Sache break; 85417508Sache case CALPHA: 85517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 85617508Sache if (isalpha((uch)c)) 85717508Sache CHadd(cs, c); 85817508Sache break; 85917508Sache case CBLANK: 86017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 86117508Sache if (isblank((uch)c)) 86217508Sache CHadd(cs, c); 86317508Sache break; 86417508Sache case CCNTRL: 86517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 86617508Sache if (iscntrl((uch)c)) 86717508Sache CHadd(cs, c); 86817508Sache break; 86917508Sache case CDIGIT: 87017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 87117508Sache if (isdigit((uch)c)) 87217508Sache CHadd(cs, c); 87317508Sache break; 87417508Sache case CGRAPH: 87517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 87617508Sache if (isgraph((uch)c)) 87717508Sache CHadd(cs, c); 87817508Sache break; 87917508Sache case CLOWER: 88017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 88117508Sache if (islower((uch)c)) 88217508Sache CHadd(cs, c); 88317508Sache break; 88417508Sache case CPRINT: 88517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 88617508Sache if (isprint((uch)c)) 88717508Sache CHadd(cs, c); 88817508Sache break; 88917508Sache case CPUNCT: 89017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 89117508Sache if (ispunct((uch)c)) 89217508Sache CHadd(cs, c); 89317508Sache break; 89417508Sache case CSPACE: 89517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 89617508Sache if (isspace((uch)c)) 89717508Sache CHadd(cs, c); 89817508Sache break; 89917508Sache case CUPPER: 90017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 90117508Sache if (isupper((uch)c)) 90217508Sache CHadd(cs, c); 90317508Sache break; 90417508Sache case CXDIGIT: 90517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 90617508Sache if (isxdigit((uch)c)) 90717508Sache CHadd(cs, c); 90817508Sache break; 90917508Sache } 9101573Srgrimes} 9111573Srgrimes 9121573Srgrimes/* 9131573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it 91492889Sobrien == static void p_b_eclass(struct parse *p, cset *cs); 9151573Srgrimes * 9161573Srgrimes * This implementation is incomplete. xxx 9171573Srgrimes */ 9181573Srgrimesstatic void 9191573Srgrimesp_b_eclass(p, cs) 92092889Sobrienstruct parse *p; 92192889Sobriencset *cs; 9221573Srgrimes{ 92392889Sobrien char c; 9241573Srgrimes 9251573Srgrimes c = p_b_coll_elem(p, '='); 9261573Srgrimes CHadd(cs, c); 9271573Srgrimes} 9281573Srgrimes 9291573Srgrimes/* 9301573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 93192889Sobrien == static char p_b_symbol(struct parse *p); 9321573Srgrimes */ 9331573Srgrimesstatic char /* value of symbol */ 9341573Srgrimesp_b_symbol(p) 93592889Sobrienstruct parse *p; 9361573Srgrimes{ 93792889Sobrien char value; 9381573Srgrimes 93917141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 9401573Srgrimes if (!EATTWO('[', '.')) 9411573Srgrimes return(GETNEXT()); 9421573Srgrimes 9431573Srgrimes /* collating symbol */ 9441573Srgrimes value = p_b_coll_elem(p, '.'); 94517141Sjkh (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 9461573Srgrimes return(value); 9471573Srgrimes} 9481573Srgrimes 9491573Srgrimes/* 9501573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up 95192889Sobrien == static char p_b_coll_elem(struct parse *p, int endc); 9521573Srgrimes */ 9531573Srgrimesstatic char /* value of collating element */ 9541573Srgrimesp_b_coll_elem(p, endc) 95592889Sobrienstruct parse *p; 9561573Srgrimesint endc; /* name ended by endc,']' */ 9571573Srgrimes{ 95892889Sobrien char *sp = p->next; 95992889Sobrien struct cname *cp; 96092889Sobrien int len; 9611573Srgrimes 9621573Srgrimes while (MORE() && !SEETWO(endc, ']')) 9631573Srgrimes NEXT(); 9641573Srgrimes if (!MORE()) { 9651573Srgrimes SETERROR(REG_EBRACK); 9661573Srgrimes return(0); 9671573Srgrimes } 9681573Srgrimes len = p->next - sp; 9691573Srgrimes for (cp = cnames; cp->name != NULL; cp++) 9701573Srgrimes if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 9711573Srgrimes return(cp->code); /* known name */ 9721573Srgrimes if (len == 1) 9731573Srgrimes return(*sp); /* single character */ 9741573Srgrimes SETERROR(REG_ECOLLATE); /* neither */ 9751573Srgrimes return(0); 9761573Srgrimes} 9771573Srgrimes 9781573Srgrimes/* 9791573Srgrimes - othercase - return the case counterpart of an alphabetic 9801573Srgrimes == static char othercase(int ch); 9811573Srgrimes */ 9821573Srgrimesstatic char /* if no counterpart, return ch */ 9831573Srgrimesothercase(ch) 9841573Srgrimesint ch; 9851573Srgrimes{ 98649094Sache ch = (uch)ch; 9871573Srgrimes assert(isalpha(ch)); 9881573Srgrimes if (isupper(ch)) 9891573Srgrimes return(tolower(ch)); 9901573Srgrimes else if (islower(ch)) 9911573Srgrimes return(toupper(ch)); 9921573Srgrimes else /* peculiar, but could happen */ 9931573Srgrimes return(ch); 9941573Srgrimes} 9951573Srgrimes 9961573Srgrimes/* 9971573Srgrimes - bothcases - emit a dualcase version of a two-case character 99892889Sobrien == static void bothcases(struct parse *p, int ch); 9991573Srgrimes * 10001573Srgrimes * Boy, is this implementation ever a kludge... 10011573Srgrimes */ 10021573Srgrimesstatic void 10031573Srgrimesbothcases(p, ch) 100492889Sobrienstruct parse *p; 10051573Srgrimesint ch; 10061573Srgrimes{ 100792889Sobrien char *oldnext = p->next; 100892889Sobrien char *oldend = p->end; 10091573Srgrimes char bracket[3]; 10101573Srgrimes 101149094Sache ch = (uch)ch; 10121573Srgrimes assert(othercase(ch) != ch); /* p_bracket() would recurse */ 10131573Srgrimes p->next = bracket; 10141573Srgrimes p->end = bracket+2; 10151573Srgrimes bracket[0] = ch; 10161573Srgrimes bracket[1] = ']'; 10171573Srgrimes bracket[2] = '\0'; 10181573Srgrimes p_bracket(p); 10191573Srgrimes assert(p->next == bracket+2); 10201573Srgrimes p->next = oldnext; 10211573Srgrimes p->end = oldend; 10221573Srgrimes} 10231573Srgrimes 10241573Srgrimes/* 10251573Srgrimes - ordinary - emit an ordinary character 102692889Sobrien == static void ordinary(struct parse *p, int ch); 10271573Srgrimes */ 10281573Srgrimesstatic void 10291573Srgrimesordinary(p, ch) 103092889Sobrienstruct parse *p; 103192889Sobrienint ch; 10321573Srgrimes{ 10331573Srgrimes 103449094Sache if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 10351573Srgrimes bothcases(p, ch); 1036131973Stjr else 103749094Sache EMIT(OCHAR, (uch)ch); 10381573Srgrimes} 10391573Srgrimes 10401573Srgrimes/* 10411573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY 104292889Sobrien == static void nonnewline(struct parse *p); 10431573Srgrimes * 10441573Srgrimes * Boy, is this implementation ever a kludge... 10451573Srgrimes */ 10461573Srgrimesstatic void 10471573Srgrimesnonnewline(p) 104892889Sobrienstruct parse *p; 10491573Srgrimes{ 105092889Sobrien char *oldnext = p->next; 105192889Sobrien char *oldend = p->end; 10521573Srgrimes char bracket[4]; 10531573Srgrimes 10541573Srgrimes p->next = bracket; 10551573Srgrimes p->end = bracket+3; 10561573Srgrimes bracket[0] = '^'; 10571573Srgrimes bracket[1] = '\n'; 10581573Srgrimes bracket[2] = ']'; 10591573Srgrimes bracket[3] = '\0'; 10601573Srgrimes p_bracket(p); 10611573Srgrimes assert(p->next == bracket+3); 10621573Srgrimes p->next = oldnext; 10631573Srgrimes p->end = oldend; 10641573Srgrimes} 10651573Srgrimes 10661573Srgrimes/* 10671573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed 106892889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to); 10691573Srgrimes */ 10701573Srgrimesstatic void 10711573Srgrimesrepeat(p, start, from, to) 107292889Sobrienstruct parse *p; 10731573Srgrimessopno start; /* operand from here to end of strip */ 10741573Srgrimesint from; /* repeated from this number */ 10751573Srgrimesint to; /* to this number of times (maybe INFINITY) */ 10761573Srgrimes{ 107792889Sobrien sopno finish = HERE(); 10781573Srgrimes# define N 2 10791573Srgrimes# define INF 3 10801573Srgrimes# define REP(f, t) ((f)*8 + (t)) 10811573Srgrimes# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 108292889Sobrien sopno copy; 10831573Srgrimes 10841573Srgrimes if (p->error != 0) /* head off possible runaway recursion */ 10851573Srgrimes return; 10861573Srgrimes 10871573Srgrimes assert(from <= to); 10881573Srgrimes 10891573Srgrimes switch (REP(MAP(from), MAP(to))) { 10901573Srgrimes case REP(0, 0): /* must be user doing this */ 10911573Srgrimes DROP(finish-start); /* drop the operand */ 10921573Srgrimes break; 10931573Srgrimes case REP(0, 1): /* as x{1,1}? */ 10941573Srgrimes case REP(0, N): /* as x{1,n}? */ 10951573Srgrimes case REP(0, INF): /* as x{1,}? */ 10961573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10971573Srgrimes INSERT(OCH_, start); /* offset is wrong... */ 10981573Srgrimes repeat(p, start+1, 1, to); 10991573Srgrimes ASTERN(OOR1, start); 11001573Srgrimes AHEAD(start); /* ... fix it */ 11011573Srgrimes EMIT(OOR2, 0); 11021573Srgrimes AHEAD(THERE()); 11031573Srgrimes ASTERN(O_CH, THERETHERE()); 11041573Srgrimes break; 11051573Srgrimes case REP(1, 1): /* trivial case */ 11061573Srgrimes /* done */ 11071573Srgrimes break; 11081573Srgrimes case REP(1, N): /* as x?x{1,n-1} */ 11091573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 11101573Srgrimes INSERT(OCH_, start); 11111573Srgrimes ASTERN(OOR1, start); 11121573Srgrimes AHEAD(start); 11131573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 11141573Srgrimes AHEAD(THERE()); /* ...so fix it */ 11151573Srgrimes ASTERN(O_CH, THERETHERE()); 11161573Srgrimes copy = dupl(p, start+1, finish+1); 11171573Srgrimes assert(copy == finish+4); 11181573Srgrimes repeat(p, copy, 1, to-1); 11191573Srgrimes break; 11201573Srgrimes case REP(1, INF): /* as x+ */ 11211573Srgrimes INSERT(OPLUS_, start); 11221573Srgrimes ASTERN(O_PLUS, start); 11231573Srgrimes break; 11241573Srgrimes case REP(N, N): /* as xx{m-1,n-1} */ 11251573Srgrimes copy = dupl(p, start, finish); 11261573Srgrimes repeat(p, copy, from-1, to-1); 11271573Srgrimes break; 11281573Srgrimes case REP(N, INF): /* as xx{n-1,INF} */ 11291573Srgrimes copy = dupl(p, start, finish); 11301573Srgrimes repeat(p, copy, from-1, to); 11311573Srgrimes break; 11321573Srgrimes default: /* "can't happen" */ 11331573Srgrimes SETERROR(REG_ASSERT); /* just in case */ 11341573Srgrimes break; 11351573Srgrimes } 11361573Srgrimes} 11371573Srgrimes 11381573Srgrimes/* 11391573Srgrimes - seterr - set an error condition 114092889Sobrien == static int seterr(struct parse *p, int e); 11411573Srgrimes */ 11421573Srgrimesstatic int /* useless but makes type checking happy */ 11431573Srgrimesseterr(p, e) 114492889Sobrienstruct parse *p; 11451573Srgrimesint e; 11461573Srgrimes{ 11471573Srgrimes if (p->error == 0) /* keep earliest error condition */ 11481573Srgrimes p->error = e; 11491573Srgrimes p->next = nuls; /* try to bring things to a halt */ 11501573Srgrimes p->end = nuls; 11511573Srgrimes return(0); /* make the return value well-defined */ 11521573Srgrimes} 11531573Srgrimes 11541573Srgrimes/* 11551573Srgrimes - allocset - allocate a set of characters for [] 115692889Sobrien == static cset *allocset(struct parse *p); 11571573Srgrimes */ 11581573Srgrimesstatic cset * 11591573Srgrimesallocset(p) 116092889Sobrienstruct parse *p; 11611573Srgrimes{ 116292889Sobrien int no = p->g->ncsets++; 116392889Sobrien size_t nc; 116492889Sobrien size_t nbytes; 116592889Sobrien cset *cs; 116692889Sobrien size_t css = (size_t)p->g->csetsize; 116792889Sobrien int i; 11681573Srgrimes 11691573Srgrimes if (no >= p->ncsalloc) { /* need another column of space */ 11701573Srgrimes p->ncsalloc += CHAR_BIT; 11711573Srgrimes nc = p->ncsalloc; 11721573Srgrimes assert(nc % CHAR_BIT == 0); 11731573Srgrimes nbytes = nc / CHAR_BIT * css; 11741573Srgrimes if (p->g->sets == NULL) 11751573Srgrimes p->g->sets = (cset *)malloc(nc * sizeof(cset)); 11761573Srgrimes else 117739327Simp p->g->sets = (cset *)reallocf((char *)p->g->sets, 11781573Srgrimes nc * sizeof(cset)); 11791573Srgrimes if (p->g->setbits == NULL) 11801573Srgrimes p->g->setbits = (uch *)malloc(nbytes); 11811573Srgrimes else { 118239327Simp p->g->setbits = (uch *)reallocf((char *)p->g->setbits, 11831573Srgrimes nbytes); 11841573Srgrimes /* xxx this isn't right if setbits is now NULL */ 11851573Srgrimes for (i = 0; i < no; i++) 11861573Srgrimes p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 11871573Srgrimes } 11881573Srgrimes if (p->g->sets != NULL && p->g->setbits != NULL) 11891573Srgrimes (void) memset((char *)p->g->setbits + (nbytes - css), 11901573Srgrimes 0, css); 11911573Srgrimes else { 11921573Srgrimes no = 0; 11931573Srgrimes SETERROR(REG_ESPACE); 11941573Srgrimes /* caller's responsibility not to do set ops */ 11951573Srgrimes } 11961573Srgrimes } 11971573Srgrimes 11981573Srgrimes assert(p->g->sets != NULL); /* xxx */ 11991573Srgrimes cs = &p->g->sets[no]; 12001573Srgrimes cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 12011573Srgrimes cs->mask = 1 << ((no) % CHAR_BIT); 12021573Srgrimes cs->hash = 0; 12031573Srgrimes 12041573Srgrimes return(cs); 12051573Srgrimes} 12061573Srgrimes 12071573Srgrimes/* 12081573Srgrimes - freeset - free a now-unused set 120992889Sobrien == static void freeset(struct parse *p, cset *cs); 12101573Srgrimes */ 12111573Srgrimesstatic void 12121573Srgrimesfreeset(p, cs) 121392889Sobrienstruct parse *p; 121492889Sobriencset *cs; 12151573Srgrimes{ 121692889Sobrien int i; 121792889Sobrien cset *top = &p->g->sets[p->g->ncsets]; 121892889Sobrien size_t css = (size_t)p->g->csetsize; 12191573Srgrimes 12201573Srgrimes for (i = 0; i < css; i++) 12211573Srgrimes CHsub(cs, i); 12221573Srgrimes if (cs == top-1) /* recover only the easy case */ 12231573Srgrimes p->g->ncsets--; 12241573Srgrimes} 12251573Srgrimes 12261573Srgrimes/* 12271573Srgrimes - freezeset - final processing on a set of characters 122892889Sobrien == static int freezeset(struct parse *p, cset *cs); 12291573Srgrimes * 12301573Srgrimes * The main task here is merging identical sets. This is usually a waste 12311573Srgrimes * of time (although the hash code minimizes the overhead), but can win 12321573Srgrimes * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 12331573Srgrimes * is done using addition rather than xor -- all ASCII [aA] sets xor to 12341573Srgrimes * the same value! 12351573Srgrimes */ 12361573Srgrimesstatic int /* set number */ 12371573Srgrimesfreezeset(p, cs) 123892889Sobrienstruct parse *p; 123992889Sobriencset *cs; 12401573Srgrimes{ 124192889Sobrien short h = cs->hash; 124292889Sobrien int i; 124392889Sobrien cset *top = &p->g->sets[p->g->ncsets]; 124492889Sobrien cset *cs2; 124592889Sobrien size_t css = (size_t)p->g->csetsize; 12461573Srgrimes 12471573Srgrimes /* look for an earlier one which is the same */ 12481573Srgrimes for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 12491573Srgrimes if (cs2->hash == h && cs2 != cs) { 12501573Srgrimes /* maybe */ 12511573Srgrimes for (i = 0; i < css; i++) 12521573Srgrimes if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 12531573Srgrimes break; /* no */ 12541573Srgrimes if (i == css) 12551573Srgrimes break; /* yes */ 12561573Srgrimes } 12571573Srgrimes 12581573Srgrimes if (cs2 < top) { /* found one */ 12591573Srgrimes freeset(p, cs); 12601573Srgrimes cs = cs2; 12611573Srgrimes } 12621573Srgrimes 12631573Srgrimes return((int)(cs - p->g->sets)); 12641573Srgrimes} 12651573Srgrimes 12661573Srgrimes/* 12671573Srgrimes - firstch - return first character in a set (which must have at least one) 126892889Sobrien == static int firstch(struct parse *p, cset *cs); 12691573Srgrimes */ 12701573Srgrimesstatic int /* character; there is no "none" value */ 12711573Srgrimesfirstch(p, cs) 127292889Sobrienstruct parse *p; 127392889Sobriencset *cs; 12741573Srgrimes{ 127592889Sobrien int i; 127692889Sobrien size_t css = (size_t)p->g->csetsize; 12771573Srgrimes 12781573Srgrimes for (i = 0; i < css; i++) 12791573Srgrimes if (CHIN(cs, i)) 128049094Sache return((char)i); 12811573Srgrimes assert(never); 12821573Srgrimes return(0); /* arbitrary */ 12831573Srgrimes} 12841573Srgrimes 12851573Srgrimes/* 12861573Srgrimes - nch - number of characters in a set 128792889Sobrien == static int nch(struct parse *p, cset *cs); 12881573Srgrimes */ 12891573Srgrimesstatic int 12901573Srgrimesnch(p, cs) 129192889Sobrienstruct parse *p; 129292889Sobriencset *cs; 12931573Srgrimes{ 129492889Sobrien int i; 129592889Sobrien size_t css = (size_t)p->g->csetsize; 129692889Sobrien int n = 0; 12971573Srgrimes 12981573Srgrimes for (i = 0; i < css; i++) 12991573Srgrimes if (CHIN(cs, i)) 13001573Srgrimes n++; 13011573Srgrimes return(n); 13021573Srgrimes} 13031573Srgrimes 13041573Srgrimes/* 13051573Srgrimes - dupl - emit a duplicate of a bunch of sops 130692889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish); 13071573Srgrimes */ 13081573Srgrimesstatic sopno /* start of duplicate */ 13091573Srgrimesdupl(p, start, finish) 131092889Sobrienstruct parse *p; 13111573Srgrimessopno start; /* from here */ 13121573Srgrimessopno finish; /* to this less one */ 13131573Srgrimes{ 131492889Sobrien sopno ret = HERE(); 131592889Sobrien sopno len = finish - start; 13161573Srgrimes 13171573Srgrimes assert(finish >= start); 13181573Srgrimes if (len == 0) 13191573Srgrimes return(ret); 13201573Srgrimes enlarge(p, p->ssize + len); /* this many unexpected additions */ 13211573Srgrimes assert(p->ssize >= p->slen + len); 13221573Srgrimes (void) memcpy((char *)(p->strip + p->slen), 13231573Srgrimes (char *)(p->strip + start), (size_t)len*sizeof(sop)); 13241573Srgrimes p->slen += len; 13251573Srgrimes return(ret); 13261573Srgrimes} 13271573Srgrimes 13281573Srgrimes/* 13291573Srgrimes - doemit - emit a strip operator 133092889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd); 13311573Srgrimes * 13321573Srgrimes * It might seem better to implement this as a macro with a function as 13331573Srgrimes * hard-case backup, but it's just too big and messy unless there are 13341573Srgrimes * some changes to the data structures. Maybe later. 13351573Srgrimes */ 13361573Srgrimesstatic void 13371573Srgrimesdoemit(p, op, opnd) 133892889Sobrienstruct parse *p; 13391573Srgrimessop op; 13401573Srgrimessize_t opnd; 13411573Srgrimes{ 13421573Srgrimes /* avoid making error situations worse */ 13431573Srgrimes if (p->error != 0) 13441573Srgrimes return; 13451573Srgrimes 13461573Srgrimes /* deal with oversize operands ("can't happen", more or less) */ 13471573Srgrimes assert(opnd < 1<<OPSHIFT); 13481573Srgrimes 13491573Srgrimes /* deal with undersized strip */ 13501573Srgrimes if (p->slen >= p->ssize) 13511573Srgrimes enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 13521573Srgrimes assert(p->slen < p->ssize); 13531573Srgrimes 13541573Srgrimes /* finally, it's all reduced to the easy case */ 13551573Srgrimes p->strip[p->slen++] = SOP(op, opnd); 13561573Srgrimes} 13571573Srgrimes 13581573Srgrimes/* 13591573Srgrimes - doinsert - insert a sop into the strip 136092889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 13611573Srgrimes */ 13621573Srgrimesstatic void 13631573Srgrimesdoinsert(p, op, opnd, pos) 136492889Sobrienstruct parse *p; 13651573Srgrimessop op; 13661573Srgrimessize_t opnd; 13671573Srgrimessopno pos; 13681573Srgrimes{ 136992889Sobrien sopno sn; 137092889Sobrien sop s; 137192889Sobrien int i; 13721573Srgrimes 13731573Srgrimes /* avoid making error situations worse */ 13741573Srgrimes if (p->error != 0) 13751573Srgrimes return; 13761573Srgrimes 13771573Srgrimes sn = HERE(); 13781573Srgrimes EMIT(op, opnd); /* do checks, ensure space */ 13791573Srgrimes assert(HERE() == sn+1); 13801573Srgrimes s = p->strip[sn]; 13811573Srgrimes 13821573Srgrimes /* adjust paren pointers */ 13831573Srgrimes assert(pos > 0); 13841573Srgrimes for (i = 1; i < NPAREN; i++) { 13851573Srgrimes if (p->pbegin[i] >= pos) { 13861573Srgrimes p->pbegin[i]++; 13871573Srgrimes } 13881573Srgrimes if (p->pend[i] >= pos) { 13891573Srgrimes p->pend[i]++; 13901573Srgrimes } 13911573Srgrimes } 13921573Srgrimes 13931573Srgrimes memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 13941573Srgrimes (HERE()-pos-1)*sizeof(sop)); 13951573Srgrimes p->strip[pos] = s; 13961573Srgrimes} 13971573Srgrimes 13981573Srgrimes/* 13991573Srgrimes - dofwd - complete a forward reference 140092889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value); 14011573Srgrimes */ 14021573Srgrimesstatic void 14031573Srgrimesdofwd(p, pos, value) 140492889Sobrienstruct parse *p; 140592889Sobriensopno pos; 14061573Srgrimessop value; 14071573Srgrimes{ 14081573Srgrimes /* avoid making error situations worse */ 14091573Srgrimes if (p->error != 0) 14101573Srgrimes return; 14111573Srgrimes 14121573Srgrimes assert(value < 1<<OPSHIFT); 14131573Srgrimes p->strip[pos] = OP(p->strip[pos]) | value; 14141573Srgrimes} 14151573Srgrimes 14161573Srgrimes/* 14171573Srgrimes - enlarge - enlarge the strip 141892889Sobrien == static void enlarge(struct parse *p, sopno size); 14191573Srgrimes */ 14201573Srgrimesstatic void 14211573Srgrimesenlarge(p, size) 142292889Sobrienstruct parse *p; 142392889Sobriensopno size; 14241573Srgrimes{ 142592889Sobrien sop *sp; 14261573Srgrimes 14271573Srgrimes if (p->ssize >= size) 14281573Srgrimes return; 14291573Srgrimes 14301573Srgrimes sp = (sop *)realloc(p->strip, size*sizeof(sop)); 14311573Srgrimes if (sp == NULL) { 14321573Srgrimes SETERROR(REG_ESPACE); 14331573Srgrimes return; 14341573Srgrimes } 14351573Srgrimes p->strip = sp; 14361573Srgrimes p->ssize = size; 14371573Srgrimes} 14381573Srgrimes 14391573Srgrimes/* 14401573Srgrimes - stripsnug - compact the strip 144192889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g); 14421573Srgrimes */ 14431573Srgrimesstatic void 14441573Srgrimesstripsnug(p, g) 144592889Sobrienstruct parse *p; 144692889Sobrienstruct re_guts *g; 14471573Srgrimes{ 14481573Srgrimes g->nstates = p->slen; 14491573Srgrimes g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 14501573Srgrimes if (g->strip == NULL) { 14511573Srgrimes SETERROR(REG_ESPACE); 14521573Srgrimes g->strip = p->strip; 14531573Srgrimes } 14541573Srgrimes} 14551573Srgrimes 14561573Srgrimes/* 14571573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string 145892889Sobrien == static void findmust(struct parse *p, struct re_guts *g); 14591573Srgrimes * 14601573Srgrimes * This algorithm could do fancy things like analyzing the operands of | 14611573Srgrimes * for common subsequences. Someday. This code is simple and finds most 14621573Srgrimes * of the interesting cases. 14631573Srgrimes * 14641573Srgrimes * Note that must and mlen got initialized during setup. 14651573Srgrimes */ 14661573Srgrimesstatic void 14671573Srgrimesfindmust(p, g) 14681573Srgrimesstruct parse *p; 146992889Sobrienstruct re_guts *g; 14701573Srgrimes{ 147192889Sobrien sop *scan; 14721573Srgrimes sop *start; 147392889Sobrien sop *newstart; 147492889Sobrien sopno newlen; 147592889Sobrien sop s; 147692889Sobrien char *cp; 147792889Sobrien sopno i; 147862391Sdcs int offset; 14791573Srgrimes 14801573Srgrimes /* avoid making error situations worse */ 14811573Srgrimes if (p->error != 0) 14821573Srgrimes return; 14831573Srgrimes 14841573Srgrimes /* find the longest OCHAR sequence in strip */ 14851573Srgrimes newlen = 0; 148662391Sdcs offset = 0; 148762391Sdcs g->moffset = 0; 14881573Srgrimes scan = g->strip + 1; 14891573Srgrimes do { 14901573Srgrimes s = *scan++; 14911573Srgrimes switch (OP(s)) { 14921573Srgrimes case OCHAR: /* sequence member */ 14931573Srgrimes if (newlen == 0) /* new sequence */ 14941573Srgrimes newstart = scan - 1; 14951573Srgrimes newlen++; 14961573Srgrimes break; 14971573Srgrimes case OPLUS_: /* things that don't break one */ 14981573Srgrimes case OLPAREN: 14991573Srgrimes case ORPAREN: 15001573Srgrimes break; 15011573Srgrimes case OQUEST_: /* things that must be skipped */ 15021573Srgrimes case OCH_: 1503131973Stjr offset = altoffset(scan, offset); 15041573Srgrimes scan--; 15051573Srgrimes do { 15061573Srgrimes scan += OPND(s); 15071573Srgrimes s = *scan; 15081573Srgrimes /* assert() interferes w debug printouts */ 15091573Srgrimes if (OP(s) != O_QUEST && OP(s) != O_CH && 15101573Srgrimes OP(s) != OOR2) { 15111573Srgrimes g->iflags |= BAD; 15121573Srgrimes return; 15131573Srgrimes } 15141573Srgrimes } while (OP(s) != O_QUEST && OP(s) != O_CH); 1515102411Scharnier /* FALLTHROUGH */ 151662391Sdcs case OBOW: /* things that break a sequence */ 151762391Sdcs case OEOW: 151862391Sdcs case OBOL: 151962391Sdcs case OEOL: 152062391Sdcs case O_QUEST: 152162391Sdcs case O_CH: 152262391Sdcs case OEND: 15231573Srgrimes if (newlen > g->mlen) { /* ends one */ 15241573Srgrimes start = newstart; 15251573Srgrimes g->mlen = newlen; 152662391Sdcs if (offset > -1) { 152762391Sdcs g->moffset += offset; 152862391Sdcs offset = newlen; 152962391Sdcs } else 153062391Sdcs g->moffset = offset; 153162391Sdcs } else { 153262391Sdcs if (offset > -1) 153362391Sdcs offset += newlen; 15341573Srgrimes } 15351573Srgrimes newlen = 0; 15361573Srgrimes break; 153762391Sdcs case OANY: 153862391Sdcs if (newlen > g->mlen) { /* ends one */ 153962391Sdcs start = newstart; 154062391Sdcs g->mlen = newlen; 154162391Sdcs if (offset > -1) { 154262391Sdcs g->moffset += offset; 154362391Sdcs offset = newlen; 154462391Sdcs } else 154562391Sdcs g->moffset = offset; 154662391Sdcs } else { 154762391Sdcs if (offset > -1) 154862391Sdcs offset += newlen; 154962391Sdcs } 155062391Sdcs if (offset > -1) 155162391Sdcs offset++; 155262391Sdcs newlen = 0; 155362391Sdcs break; 155462391Sdcs case OANYOF: /* may or may not invalidate offset */ 155562391Sdcs /* First, everything as OANY */ 155662391Sdcs if (newlen > g->mlen) { /* ends one */ 155762391Sdcs start = newstart; 155862391Sdcs g->mlen = newlen; 155962391Sdcs if (offset > -1) { 156062391Sdcs g->moffset += offset; 156162391Sdcs offset = newlen; 156262391Sdcs } else 156362391Sdcs g->moffset = offset; 156462391Sdcs } else { 156562391Sdcs if (offset > -1) 156662391Sdcs offset += newlen; 156762391Sdcs } 156862391Sdcs if (offset > -1) 156962391Sdcs offset++; 157062391Sdcs newlen = 0; 157162391Sdcs break; 157262391Sdcs default: 157362391Sdcs /* Anything here makes it impossible or too hard 157462391Sdcs * to calculate the offset -- so we give up; 157562391Sdcs * save the last known good offset, in case the 157662391Sdcs * must sequence doesn't occur later. 157762391Sdcs */ 157862391Sdcs if (newlen > g->mlen) { /* ends one */ 157962391Sdcs start = newstart; 158062391Sdcs g->mlen = newlen; 158162391Sdcs if (offset > -1) 158262391Sdcs g->moffset += offset; 158362391Sdcs else 158462391Sdcs g->moffset = offset; 158562391Sdcs } 158662391Sdcs offset = -1; 158762391Sdcs newlen = 0; 158862391Sdcs break; 15891573Srgrimes } 15901573Srgrimes } while (OP(s) != OEND); 15911573Srgrimes 159262391Sdcs if (g->mlen == 0) { /* there isn't one */ 159362391Sdcs g->moffset = -1; 15941573Srgrimes return; 159562391Sdcs } 15961573Srgrimes 15971573Srgrimes /* turn it into a character string */ 15981573Srgrimes g->must = malloc((size_t)g->mlen + 1); 15991573Srgrimes if (g->must == NULL) { /* argh; just forget it */ 16001573Srgrimes g->mlen = 0; 160162391Sdcs g->moffset = -1; 16021573Srgrimes return; 16031573Srgrimes } 16041573Srgrimes cp = g->must; 16051573Srgrimes scan = start; 16061573Srgrimes for (i = g->mlen; i > 0; i--) { 16071573Srgrimes while (OP(s = *scan++) != OCHAR) 16081573Srgrimes continue; 16091573Srgrimes assert(cp < g->must + g->mlen); 16101573Srgrimes *cp++ = (char)OPND(s); 16111573Srgrimes } 16121573Srgrimes assert(cp == g->must + g->mlen); 16131573Srgrimes *cp++ = '\0'; /* just on general principles */ 16141573Srgrimes} 16151573Srgrimes 16161573Srgrimes/* 161762391Sdcs - altoffset - choose biggest offset among multiple choices 1618131973Stjr == static int altoffset(sop *scan, int offset); 161962391Sdcs * 162062391Sdcs * Compute, recursively if necessary, the largest offset among multiple 162162391Sdcs * re paths. 162262391Sdcs */ 162362391Sdcsstatic int 1624131973Stjraltoffset(scan, offset) 162562391Sdcssop *scan; 162662391Sdcsint offset; 162762391Sdcs{ 162862391Sdcs int largest; 162962391Sdcs int try; 163062391Sdcs sop s; 163162391Sdcs 163262391Sdcs /* If we gave up already on offsets, return */ 163362391Sdcs if (offset == -1) 163462391Sdcs return -1; 163562391Sdcs 163662391Sdcs largest = 0; 163762391Sdcs try = 0; 163862391Sdcs s = *scan++; 163962391Sdcs while (OP(s) != O_QUEST && OP(s) != O_CH) { 164062391Sdcs switch (OP(s)) { 164162391Sdcs case OOR1: 164262391Sdcs if (try > largest) 164362391Sdcs largest = try; 164462391Sdcs try = 0; 164562391Sdcs break; 164662391Sdcs case OQUEST_: 164762391Sdcs case OCH_: 1648131973Stjr try = altoffset(scan, try); 164962391Sdcs if (try == -1) 165062391Sdcs return -1; 165162391Sdcs scan--; 165262391Sdcs do { 165362391Sdcs scan += OPND(s); 165462391Sdcs s = *scan; 165562391Sdcs if (OP(s) != O_QUEST && OP(s) != O_CH && 165662391Sdcs OP(s) != OOR2) 165762391Sdcs return -1; 165862391Sdcs } while (OP(s) != O_QUEST && OP(s) != O_CH); 165962855Sdcs /* We must skip to the next position, or we'll 166062855Sdcs * leave altoffset() too early. 166162855Sdcs */ 166262855Sdcs scan++; 166362391Sdcs break; 166462391Sdcs case OANYOF: 166562391Sdcs case OCHAR: 166662391Sdcs case OANY: 166762391Sdcs try++; 166862391Sdcs case OBOW: 166962391Sdcs case OEOW: 167062391Sdcs case OLPAREN: 167162391Sdcs case ORPAREN: 167262391Sdcs case OOR2: 167362391Sdcs break; 167462391Sdcs default: 167562391Sdcs try = -1; 167662391Sdcs break; 167762391Sdcs } 167862391Sdcs if (try == -1) 167962391Sdcs return -1; 168062391Sdcs s = *scan++; 168162391Sdcs } 168262391Sdcs 168362391Sdcs if (try > largest) 168462391Sdcs largest = try; 168562391Sdcs 168662391Sdcs return largest+offset; 168762391Sdcs} 168862391Sdcs 168962391Sdcs/* 169062232Sdcs - computejumps - compute char jumps for BM scan 169192889Sobrien == static void computejumps(struct parse *p, struct re_guts *g); 169262232Sdcs * 169362232Sdcs * This algorithm assumes g->must exists and is has size greater than 169462232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 169562232Sdcs * Sara Baase. 169662232Sdcs * 169762232Sdcs * A char jump is the number of characters one needs to jump based on 169862232Sdcs * the value of the character from the text that was mismatched. 169962232Sdcs */ 170062232Sdcsstatic void 170162232Sdcscomputejumps(p, g) 170262232Sdcsstruct parse *p; 170362232Sdcsstruct re_guts *g; 170462232Sdcs{ 170562232Sdcs int ch; 170662232Sdcs int mindex; 170762232Sdcs 170862232Sdcs /* Avoid making errors worse */ 170962232Sdcs if (p->error != 0) 171062232Sdcs return; 171162232Sdcs 171262848Sdcs g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 171362232Sdcs if (g->charjump == NULL) /* Not a fatal error */ 171462232Sdcs return; 171562754Sdcs /* Adjust for signed chars, if necessary */ 171662754Sdcs g->charjump = &g->charjump[-(CHAR_MIN)]; 171762232Sdcs 171862232Sdcs /* If the character does not exist in the pattern, the jump 171962232Sdcs * is equal to the number of characters in the pattern. 172062232Sdcs */ 172162754Sdcs for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 172262232Sdcs g->charjump[ch] = g->mlen; 172362232Sdcs 172462232Sdcs /* If the character does exist, compute the jump that would 172562232Sdcs * take us to the last character in the pattern equal to it 172662232Sdcs * (notice that we match right to left, so that last character 172762232Sdcs * is the first one that would be matched). 172862232Sdcs */ 172962232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 1730111010Snectar g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 173162232Sdcs} 173262232Sdcs 173362232Sdcs/* 173462232Sdcs - computematchjumps - compute match jumps for BM scan 173592889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g); 173662232Sdcs * 173762232Sdcs * This algorithm assumes g->must exists and is has size greater than 173862232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 173962232Sdcs * Sara Baase. 174062232Sdcs * 174162232Sdcs * A match jump is the number of characters one needs to advance based 174262232Sdcs * on the already-matched suffix. 174362232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way 174462232Sdcs * the search algorithm works. 174562232Sdcs */ 174662232Sdcsstatic void 174762232Sdcscomputematchjumps(p, g) 174862232Sdcsstruct parse *p; 174962232Sdcsstruct re_guts *g; 175062232Sdcs{ 175162232Sdcs int mindex; /* General "must" iterator */ 175262232Sdcs int suffix; /* Keeps track of matching suffix */ 175362232Sdcs int ssuffix; /* Keeps track of suffixes' suffix */ 175462232Sdcs int* pmatches; /* pmatches[k] points to the next i 175562232Sdcs * such that i+1...mlen is a substring 175662232Sdcs * of k+1...k+mlen-i-1 175762232Sdcs */ 175862232Sdcs 175962232Sdcs /* Avoid making errors worse */ 176062232Sdcs if (p->error != 0) 176162232Sdcs return; 176262232Sdcs 176362848Sdcs pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 176462232Sdcs if (pmatches == NULL) { 176562232Sdcs g->matchjump = NULL; 176662232Sdcs return; 176762232Sdcs } 176862232Sdcs 176962848Sdcs g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 177062232Sdcs if (g->matchjump == NULL) /* Not a fatal error */ 177162232Sdcs return; 177262232Sdcs 177362232Sdcs /* Set maximum possible jump for each character in the pattern */ 177462232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 177562232Sdcs g->matchjump[mindex] = 2*g->mlen - mindex - 1; 177662232Sdcs 177762232Sdcs /* Compute pmatches[] */ 177862232Sdcs for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 177962232Sdcs mindex--, suffix--) { 178062232Sdcs pmatches[mindex] = suffix; 178162232Sdcs 178262232Sdcs /* If a mismatch is found, interrupting the substring, 178362232Sdcs * compute the matchjump for that position. If no 178462232Sdcs * mismatch is found, then a text substring mismatched 178562232Sdcs * against the suffix will also mismatch against the 178662232Sdcs * substring. 178762232Sdcs */ 178862232Sdcs while (suffix < g->mlen 178962232Sdcs && g->must[mindex] != g->must[suffix]) { 179062232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 179162232Sdcs g->mlen - mindex - 1); 179262232Sdcs suffix = pmatches[suffix]; 179362232Sdcs } 179462232Sdcs } 179562232Sdcs 179662232Sdcs /* Compute the matchjump up to the last substring found to jump 179762232Sdcs * to the beginning of the largest must pattern prefix matching 179862232Sdcs * it's own suffix. 179962232Sdcs */ 180062232Sdcs for (mindex = 0; mindex <= suffix; mindex++) 180162232Sdcs g->matchjump[mindex] = MIN(g->matchjump[mindex], 180262232Sdcs g->mlen + suffix - mindex); 180362232Sdcs 180462232Sdcs ssuffix = pmatches[suffix]; 180562391Sdcs while (suffix < g->mlen) { 180662673Sdcs while (suffix <= ssuffix && suffix < g->mlen) { 180762232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 180862232Sdcs g->mlen + ssuffix - suffix); 180962232Sdcs suffix++; 181062232Sdcs } 181186208Sdcs if (suffix < g->mlen) 181286208Sdcs ssuffix = pmatches[ssuffix]; 181362232Sdcs } 181462232Sdcs 181562232Sdcs free(pmatches); 181662232Sdcs} 181762232Sdcs 181862232Sdcs/* 18191573Srgrimes - pluscount - count + nesting 182092889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g); 18211573Srgrimes */ 18221573Srgrimesstatic sopno /* nesting depth */ 18231573Srgrimespluscount(p, g) 18241573Srgrimesstruct parse *p; 182592889Sobrienstruct re_guts *g; 18261573Srgrimes{ 182792889Sobrien sop *scan; 182892889Sobrien sop s; 182992889Sobrien sopno plusnest = 0; 183092889Sobrien sopno maxnest = 0; 18311573Srgrimes 18321573Srgrimes if (p->error != 0) 18331573Srgrimes return(0); /* there may not be an OEND */ 18341573Srgrimes 18351573Srgrimes scan = g->strip + 1; 18361573Srgrimes do { 18371573Srgrimes s = *scan++; 18381573Srgrimes switch (OP(s)) { 18391573Srgrimes case OPLUS_: 18401573Srgrimes plusnest++; 18411573Srgrimes break; 18421573Srgrimes case O_PLUS: 18431573Srgrimes if (plusnest > maxnest) 18441573Srgrimes maxnest = plusnest; 18451573Srgrimes plusnest--; 18461573Srgrimes break; 18471573Srgrimes } 18481573Srgrimes } while (OP(s) != OEND); 18491573Srgrimes if (plusnest != 0) 18501573Srgrimes g->iflags |= BAD; 18511573Srgrimes return(maxnest); 18521573Srgrimes} 1853