regcomp.c revision 92889
11573Srgrimes/*- 21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer. 31573Srgrimes * Copyright (c) 1992, 1993, 1994 41573Srgrimes * The Regents of the University of California. All rights reserved. 51573Srgrimes * 61573Srgrimes * This code is derived from software contributed to Berkeley by 71573Srgrimes * Henry Spencer. 81573Srgrimes * 91573Srgrimes * Redistribution and use in source and binary forms, with or without 101573Srgrimes * modification, are permitted provided that the following conditions 111573Srgrimes * are met: 121573Srgrimes * 1. Redistributions of source code must retain the above copyright 131573Srgrimes * notice, this list of conditions and the following disclaimer. 141573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 151573Srgrimes * notice, this list of conditions and the following disclaimer in the 161573Srgrimes * documentation and/or other materials provided with the distribution. 171573Srgrimes * 3. All advertising materials mentioning features or use of this software 181573Srgrimes * must display the following acknowledgement: 191573Srgrimes * This product includes software developed by the University of 201573Srgrimes * California, Berkeley and its contributors. 211573Srgrimes * 4. Neither the name of the University nor the names of its contributors 221573Srgrimes * may be used to endorse or promote products derived from this software 231573Srgrimes * without specific prior written permission. 241573Srgrimes * 251573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 261573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 271573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 281573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 291573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 301573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 311573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 321573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 331573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 341573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 351573Srgrimes * SUCH DAMAGE. 361573Srgrimes * 371573Srgrimes * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 3862232Sdcs * 3962232Sdcs * $FreeBSD: head/lib/libc/regex/regcomp.c 92889 2002-03-21 18:49:23Z obrien $ 401573Srgrimes */ 411573Srgrimes 421573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 431573Srgrimesstatic char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 441573Srgrimes#endif /* LIBC_SCCS and not lint */ 451573Srgrimes 461573Srgrimes#include <sys/types.h> 471573Srgrimes#include <stdio.h> 481573Srgrimes#include <string.h> 491573Srgrimes#include <ctype.h> 501573Srgrimes#include <limits.h> 511573Srgrimes#include <stdlib.h> 521573Srgrimes#include <regex.h> 531573Srgrimes 5419277Sache#include "collate.h" 5519277Sache 561573Srgrimes#include "utils.h" 571573Srgrimes#include "regex2.h" 581573Srgrimes 591573Srgrimes#include "cclass.h" 601573Srgrimes#include "cname.h" 611573Srgrimes 621573Srgrimes/* 631573Srgrimes * parse structure, passed up and down to avoid global variables and 641573Srgrimes * other clumsinesses 651573Srgrimes */ 661573Srgrimesstruct parse { 671573Srgrimes char *next; /* next character in RE */ 681573Srgrimes char *end; /* end of string (-> NUL normally) */ 691573Srgrimes int error; /* has an error been seen? */ 701573Srgrimes sop *strip; /* malloced strip */ 711573Srgrimes sopno ssize; /* malloced strip size (allocated) */ 721573Srgrimes sopno slen; /* malloced strip length (used) */ 731573Srgrimes int ncsalloc; /* number of csets allocated */ 741573Srgrimes struct re_guts *g; 751573Srgrimes# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 761573Srgrimes sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 771573Srgrimes sopno pend[NPAREN]; /* -> ) ([0] unused) */ 781573Srgrimes}; 791573Srgrimes 801573Srgrimes/* ========= begin header generated by ./mkh ========= */ 811573Srgrimes#ifdef __cplusplus 821573Srgrimesextern "C" { 831573Srgrimes#endif 841573Srgrimes 851573Srgrimes/* === regcomp.c === */ 861573Srgrimesstatic void p_ere __P((struct parse *p, int stop)); 871573Srgrimesstatic void p_ere_exp __P((struct parse *p)); 881573Srgrimesstatic void p_str __P((struct parse *p)); 891573Srgrimesstatic void p_bre __P((struct parse *p, int end1, int end2)); 901573Srgrimesstatic int p_simp_re __P((struct parse *p, int starordinary)); 911573Srgrimesstatic int p_count __P((struct parse *p)); 921573Srgrimesstatic void p_bracket __P((struct parse *p)); 931573Srgrimesstatic void p_b_term __P((struct parse *p, cset *cs)); 941573Srgrimesstatic void p_b_cclass __P((struct parse *p, cset *cs)); 951573Srgrimesstatic void p_b_eclass __P((struct parse *p, cset *cs)); 961573Srgrimesstatic char p_b_symbol __P((struct parse *p)); 971573Srgrimesstatic char p_b_coll_elem __P((struct parse *p, int endc)); 981573Srgrimesstatic char othercase __P((int ch)); 991573Srgrimesstatic void bothcases __P((struct parse *p, int ch)); 1001573Srgrimesstatic void ordinary __P((struct parse *p, int ch)); 1011573Srgrimesstatic void nonnewline __P((struct parse *p)); 1021573Srgrimesstatic void repeat __P((struct parse *p, sopno start, int from, int to)); 1031573Srgrimesstatic int seterr __P((struct parse *p, int e)); 1041573Srgrimesstatic cset *allocset __P((struct parse *p)); 1051573Srgrimesstatic void freeset __P((struct parse *p, cset *cs)); 1061573Srgrimesstatic int freezeset __P((struct parse *p, cset *cs)); 1071573Srgrimesstatic int firstch __P((struct parse *p, cset *cs)); 1081573Srgrimesstatic int nch __P((struct parse *p, cset *cs)); 1091573Srgrimesstatic void mcadd __P((struct parse *p, cset *cs, char *cp)); 11017141Sjkh#if used 1111573Srgrimesstatic void mcsub __P((cset *cs, char *cp)); 1121573Srgrimesstatic int mcin __P((cset *cs, char *cp)); 1131573Srgrimesstatic char *mcfind __P((cset *cs, char *cp)); 11417141Sjkh#endif 1151573Srgrimesstatic void mcinvert __P((struct parse *p, cset *cs)); 1161573Srgrimesstatic void mccase __P((struct parse *p, cset *cs)); 1171573Srgrimesstatic int isinsets __P((struct re_guts *g, int c)); 1181573Srgrimesstatic int samesets __P((struct re_guts *g, int c1, int c2)); 1191573Srgrimesstatic void categorize __P((struct parse *p, struct re_guts *g)); 1201573Srgrimesstatic sopno dupl __P((struct parse *p, sopno start, sopno finish)); 1211573Srgrimesstatic void doemit __P((struct parse *p, sop op, size_t opnd)); 1221573Srgrimesstatic void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 1231573Srgrimesstatic void dofwd __P((struct parse *p, sopno pos, sop value)); 1241573Srgrimesstatic void enlarge __P((struct parse *p, sopno size)); 1251573Srgrimesstatic void stripsnug __P((struct parse *p, struct re_guts *g)); 1261573Srgrimesstatic void findmust __P((struct parse *p, struct re_guts *g)); 12762391Sdcsstatic int altoffset __P((sop *scan, int offset, int mccs)); 12862232Sdcsstatic void computejumps __P((struct parse *p, struct re_guts *g)); 12962232Sdcsstatic void computematchjumps __P((struct parse *p, struct re_guts *g)); 1301573Srgrimesstatic sopno pluscount __P((struct parse *p, struct re_guts *g)); 1311573Srgrimes 1321573Srgrimes#ifdef __cplusplus 1331573Srgrimes} 1341573Srgrimes#endif 1351573Srgrimes/* ========= end header generated by ./mkh ========= */ 1361573Srgrimes 1371573Srgrimesstatic char nuls[10]; /* place to point scanner in event of error */ 1381573Srgrimes 1391573Srgrimes/* 1401573Srgrimes * macros for use with parse structure 1411573Srgrimes * BEWARE: these know that the parse structure is named `p' !!! 1421573Srgrimes */ 1431573Srgrimes#define PEEK() (*p->next) 1441573Srgrimes#define PEEK2() (*(p->next+1)) 1451573Srgrimes#define MORE() (p->next < p->end) 1461573Srgrimes#define MORE2() (p->next+1 < p->end) 1471573Srgrimes#define SEE(c) (MORE() && PEEK() == (c)) 1481573Srgrimes#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 1491573Srgrimes#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 1501573Srgrimes#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 1511573Srgrimes#define NEXT() (p->next++) 1521573Srgrimes#define NEXT2() (p->next += 2) 1531573Srgrimes#define NEXTn(n) (p->next += (n)) 1541573Srgrimes#define GETNEXT() (*p->next++) 1551573Srgrimes#define SETERROR(e) seterr(p, (e)) 1561573Srgrimes#define REQUIRE(co, e) ((co) || SETERROR(e)) 1571573Srgrimes#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 1581573Srgrimes#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 1591573Srgrimes#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 1601573Srgrimes#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 1611573Srgrimes#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 1621573Srgrimes#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 1631573Srgrimes#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 1641573Srgrimes#define HERE() (p->slen) 1651573Srgrimes#define THERE() (p->slen - 1) 1661573Srgrimes#define THERETHERE() (p->slen - 2) 1671573Srgrimes#define DROP(n) (p->slen -= (n)) 1681573Srgrimes 1691573Srgrimes#ifndef NDEBUG 1701573Srgrimesstatic int never = 0; /* for use in asserts; shuts lint up */ 1711573Srgrimes#else 1721573Srgrimes#define never 0 /* some <assert.h>s have bugs too */ 1731573Srgrimes#endif 1741573Srgrimes 17562232Sdcs/* Macro used by computejump()/computematchjump() */ 17662232Sdcs#define MIN(a,b) ((a)<(b)?(a):(b)) 17762232Sdcs 1781573Srgrimes/* 1791573Srgrimes - regcomp - interface for parser and compilation 1801573Srgrimes = extern int regcomp(regex_t *, const char *, int); 1811573Srgrimes = #define REG_BASIC 0000 1821573Srgrimes = #define REG_EXTENDED 0001 1831573Srgrimes = #define REG_ICASE 0002 1841573Srgrimes = #define REG_NOSUB 0004 1851573Srgrimes = #define REG_NEWLINE 0010 1861573Srgrimes = #define REG_NOSPEC 0020 1871573Srgrimes = #define REG_PEND 0040 1881573Srgrimes = #define REG_DUMP 0200 1891573Srgrimes */ 1901573Srgrimesint /* 0 success, otherwise REG_something */ 1911573Srgrimesregcomp(preg, pattern, cflags) 1921573Srgrimesregex_t *preg; 1931573Srgrimesconst char *pattern; 1941573Srgrimesint cflags; 1951573Srgrimes{ 1961573Srgrimes struct parse pa; 19792889Sobrien struct re_guts *g; 19892889Sobrien struct parse *p = &pa; 19992889Sobrien int i; 20092889Sobrien size_t len; 2011573Srgrimes#ifdef REDEBUG 2021573Srgrimes# define GOODFLAGS(f) (f) 2031573Srgrimes#else 2041573Srgrimes# define GOODFLAGS(f) ((f)&~REG_DUMP) 2051573Srgrimes#endif 2061573Srgrimes 2071573Srgrimes cflags = GOODFLAGS(cflags); 2081573Srgrimes if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 2091573Srgrimes return(REG_INVARG); 2101573Srgrimes 2111573Srgrimes if (cflags®_PEND) { 2121573Srgrimes if (preg->re_endp < pattern) 2131573Srgrimes return(REG_INVARG); 2141573Srgrimes len = preg->re_endp - pattern; 2151573Srgrimes } else 2161573Srgrimes len = strlen((char *)pattern); 2171573Srgrimes 2181573Srgrimes /* do the mallocs early so failure handling is easy */ 2191573Srgrimes g = (struct re_guts *)malloc(sizeof(struct re_guts) + 2201573Srgrimes (NC-1)*sizeof(cat_t)); 2211573Srgrimes if (g == NULL) 2221573Srgrimes return(REG_ESPACE); 2231573Srgrimes p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 2241573Srgrimes p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 2251573Srgrimes p->slen = 0; 2261573Srgrimes if (p->strip == NULL) { 2271573Srgrimes free((char *)g); 2281573Srgrimes return(REG_ESPACE); 2291573Srgrimes } 2301573Srgrimes 2311573Srgrimes /* set things up */ 2321573Srgrimes p->g = g; 2331573Srgrimes p->next = (char *)pattern; /* convenience; we do not modify it */ 2341573Srgrimes p->end = p->next + len; 2351573Srgrimes p->error = 0; 2361573Srgrimes p->ncsalloc = 0; 2371573Srgrimes for (i = 0; i < NPAREN; i++) { 2381573Srgrimes p->pbegin[i] = 0; 2391573Srgrimes p->pend[i] = 0; 2401573Srgrimes } 2411573Srgrimes g->csetsize = NC; 2421573Srgrimes g->sets = NULL; 2431573Srgrimes g->setbits = NULL; 2441573Srgrimes g->ncsets = 0; 2451573Srgrimes g->cflags = cflags; 2461573Srgrimes g->iflags = 0; 2471573Srgrimes g->nbol = 0; 2481573Srgrimes g->neol = 0; 2491573Srgrimes g->must = NULL; 25062391Sdcs g->moffset = -1; 25162263Sdcs g->charjump = NULL; 25262263Sdcs g->matchjump = NULL; 2531573Srgrimes g->mlen = 0; 2541573Srgrimes g->nsub = 0; 2551573Srgrimes g->ncategories = 1; /* category 0 is "everything else" */ 2561573Srgrimes g->categories = &g->catspace[-(CHAR_MIN)]; 2571573Srgrimes (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 2581573Srgrimes g->backrefs = 0; 2591573Srgrimes 2601573Srgrimes /* do it */ 2611573Srgrimes EMIT(OEND, 0); 2621573Srgrimes g->firststate = THERE(); 2631573Srgrimes if (cflags®_EXTENDED) 2641573Srgrimes p_ere(p, OUT); 2651573Srgrimes else if (cflags®_NOSPEC) 2661573Srgrimes p_str(p); 2671573Srgrimes else 2681573Srgrimes p_bre(p, OUT, OUT); 2691573Srgrimes EMIT(OEND, 0); 2701573Srgrimes g->laststate = THERE(); 2711573Srgrimes 2721573Srgrimes /* tidy up loose ends and fill things in */ 2731573Srgrimes categorize(p, g); 2741573Srgrimes stripsnug(p, g); 2751573Srgrimes findmust(p, g); 27662232Sdcs /* only use Boyer-Moore algorithm if the pattern is bigger 27762232Sdcs * than three characters 27862232Sdcs */ 27962232Sdcs if(g->mlen > 3) { 28062232Sdcs computejumps(p, g); 28162232Sdcs computematchjumps(p, g); 28262755Sdcs if(g->matchjump == NULL && g->charjump != NULL) { 28362232Sdcs free(g->charjump); 28462232Sdcs g->charjump = NULL; 28562232Sdcs } 28662232Sdcs } 2871573Srgrimes g->nplus = pluscount(p, g); 2881573Srgrimes g->magic = MAGIC2; 2891573Srgrimes preg->re_nsub = g->nsub; 2901573Srgrimes preg->re_g = g; 2911573Srgrimes preg->re_magic = MAGIC1; 2921573Srgrimes#ifndef REDEBUG 2931573Srgrimes /* not debugging, so can't rely on the assert() in regexec() */ 2941573Srgrimes if (g->iflags&BAD) 2951573Srgrimes SETERROR(REG_ASSERT); 2961573Srgrimes#endif 2971573Srgrimes 2981573Srgrimes /* win or lose, we're done */ 2991573Srgrimes if (p->error != 0) /* lose */ 3001573Srgrimes regfree(preg); 3011573Srgrimes return(p->error); 3021573Srgrimes} 3031573Srgrimes 3041573Srgrimes/* 3051573Srgrimes - p_ere - ERE parser top level, concatenation and alternation 30692889Sobrien == static void p_ere(struct parse *p, int stop); 3071573Srgrimes */ 3081573Srgrimesstatic void 3091573Srgrimesp_ere(p, stop) 31092889Sobrienstruct parse *p; 3111573Srgrimesint stop; /* character this ERE should end at */ 3121573Srgrimes{ 31392889Sobrien char c; 31492889Sobrien sopno prevback; 31592889Sobrien sopno prevfwd; 31692889Sobrien sopno conc; 31792889Sobrien int first = 1; /* is this the first alternative? */ 3181573Srgrimes 3191573Srgrimes for (;;) { 3201573Srgrimes /* do a bunch of concatenated expressions */ 3211573Srgrimes conc = HERE(); 3221573Srgrimes while (MORE() && (c = PEEK()) != '|' && c != stop) 3231573Srgrimes p_ere_exp(p); 32417141Sjkh (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 3251573Srgrimes 3261573Srgrimes if (!EAT('|')) 3271573Srgrimes break; /* NOTE BREAK OUT */ 3281573Srgrimes 3291573Srgrimes if (first) { 3301573Srgrimes INSERT(OCH_, conc); /* offset is wrong */ 3311573Srgrimes prevfwd = conc; 3321573Srgrimes prevback = conc; 3331573Srgrimes first = 0; 3341573Srgrimes } 3351573Srgrimes ASTERN(OOR1, prevback); 3361573Srgrimes prevback = THERE(); 3371573Srgrimes AHEAD(prevfwd); /* fix previous offset */ 3381573Srgrimes prevfwd = HERE(); 3391573Srgrimes EMIT(OOR2, 0); /* offset is very wrong */ 3401573Srgrimes } 3411573Srgrimes 3421573Srgrimes if (!first) { /* tail-end fixups */ 3431573Srgrimes AHEAD(prevfwd); 3441573Srgrimes ASTERN(O_CH, prevback); 3451573Srgrimes } 3461573Srgrimes 3471573Srgrimes assert(!MORE() || SEE(stop)); 3481573Srgrimes} 3491573Srgrimes 3501573Srgrimes/* 3511573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 35292889Sobrien == static void p_ere_exp(struct parse *p); 3531573Srgrimes */ 3541573Srgrimesstatic void 3551573Srgrimesp_ere_exp(p) 35692889Sobrienstruct parse *p; 3571573Srgrimes{ 35892889Sobrien char c; 35992889Sobrien sopno pos; 36092889Sobrien int count; 36192889Sobrien int count2; 36292889Sobrien sopno subno; 3631573Srgrimes int wascaret = 0; 3641573Srgrimes 3651573Srgrimes assert(MORE()); /* caller should have ensured this */ 3661573Srgrimes c = GETNEXT(); 3671573Srgrimes 3681573Srgrimes pos = HERE(); 3691573Srgrimes switch (c) { 3701573Srgrimes case '(': 37117141Sjkh (void)REQUIRE(MORE(), REG_EPAREN); 3721573Srgrimes p->g->nsub++; 3731573Srgrimes subno = p->g->nsub; 3741573Srgrimes if (subno < NPAREN) 3751573Srgrimes p->pbegin[subno] = HERE(); 3761573Srgrimes EMIT(OLPAREN, subno); 3771573Srgrimes if (!SEE(')')) 3781573Srgrimes p_ere(p, ')'); 3791573Srgrimes if (subno < NPAREN) { 3801573Srgrimes p->pend[subno] = HERE(); 3811573Srgrimes assert(p->pend[subno] != 0); 3821573Srgrimes } 3831573Srgrimes EMIT(ORPAREN, subno); 38417141Sjkh (void)MUSTEAT(')', REG_EPAREN); 3851573Srgrimes break; 3861573Srgrimes#ifndef POSIX_MISTAKE 3871573Srgrimes case ')': /* happens only if no current unmatched ( */ 3881573Srgrimes /* 3891573Srgrimes * You may ask, why the ifndef? Because I didn't notice 3901573Srgrimes * this until slightly too late for 1003.2, and none of the 3911573Srgrimes * other 1003.2 regular-expression reviewers noticed it at 3921573Srgrimes * all. So an unmatched ) is legal POSIX, at least until 3931573Srgrimes * we can get it fixed. 3941573Srgrimes */ 3951573Srgrimes SETERROR(REG_EPAREN); 3961573Srgrimes break; 3971573Srgrimes#endif 3981573Srgrimes case '^': 3991573Srgrimes EMIT(OBOL, 0); 4001573Srgrimes p->g->iflags |= USEBOL; 4011573Srgrimes p->g->nbol++; 4021573Srgrimes wascaret = 1; 4031573Srgrimes break; 4041573Srgrimes case '$': 4051573Srgrimes EMIT(OEOL, 0); 4061573Srgrimes p->g->iflags |= USEEOL; 4071573Srgrimes p->g->neol++; 4081573Srgrimes break; 4091573Srgrimes case '|': 4101573Srgrimes SETERROR(REG_EMPTY); 4111573Srgrimes break; 4121573Srgrimes case '*': 4131573Srgrimes case '+': 4141573Srgrimes case '?': 4151573Srgrimes SETERROR(REG_BADRPT); 4161573Srgrimes break; 4171573Srgrimes case '.': 4181573Srgrimes if (p->g->cflags®_NEWLINE) 4191573Srgrimes nonnewline(p); 4201573Srgrimes else 4211573Srgrimes EMIT(OANY, 0); 4221573Srgrimes break; 4231573Srgrimes case '[': 4241573Srgrimes p_bracket(p); 4251573Srgrimes break; 4261573Srgrimes case '\\': 42717141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 4281573Srgrimes c = GETNEXT(); 4291573Srgrimes ordinary(p, c); 4301573Srgrimes break; 4311573Srgrimes case '{': /* okay as ordinary except if digit follows */ 43249094Sache (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 4331573Srgrimes /* FALLTHROUGH */ 4341573Srgrimes default: 4351573Srgrimes ordinary(p, c); 4361573Srgrimes break; 4371573Srgrimes } 4381573Srgrimes 4391573Srgrimes if (!MORE()) 4401573Srgrimes return; 4411573Srgrimes c = PEEK(); 4421573Srgrimes /* we call { a repetition if followed by a digit */ 4431573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 44449094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 4451573Srgrimes return; /* no repetition, we're done */ 4461573Srgrimes NEXT(); 4471573Srgrimes 44817141Sjkh (void)REQUIRE(!wascaret, REG_BADRPT); 4491573Srgrimes switch (c) { 4501573Srgrimes case '*': /* implemented as +? */ 4511573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 4521573Srgrimes INSERT(OPLUS_, pos); 4531573Srgrimes ASTERN(O_PLUS, pos); 4541573Srgrimes INSERT(OQUEST_, pos); 4551573Srgrimes ASTERN(O_QUEST, pos); 4561573Srgrimes break; 4571573Srgrimes case '+': 4581573Srgrimes INSERT(OPLUS_, pos); 4591573Srgrimes ASTERN(O_PLUS, pos); 4601573Srgrimes break; 4611573Srgrimes case '?': 4621573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 4631573Srgrimes INSERT(OCH_, pos); /* offset slightly wrong */ 4641573Srgrimes ASTERN(OOR1, pos); /* this one's right */ 4651573Srgrimes AHEAD(pos); /* fix the OCH_ */ 4661573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 4671573Srgrimes AHEAD(THERE()); /* ...so fix it */ 4681573Srgrimes ASTERN(O_CH, THERETHERE()); 4691573Srgrimes break; 4701573Srgrimes case '{': 4711573Srgrimes count = p_count(p); 4721573Srgrimes if (EAT(',')) { 47349094Sache if (isdigit((uch)PEEK())) { 4741573Srgrimes count2 = p_count(p); 47517141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 4761573Srgrimes } else /* single number with comma */ 4771573Srgrimes count2 = INFINITY; 4781573Srgrimes } else /* just a single number */ 4791573Srgrimes count2 = count; 4801573Srgrimes repeat(p, pos, count, count2); 4811573Srgrimes if (!EAT('}')) { /* error heuristics */ 4821573Srgrimes while (MORE() && PEEK() != '}') 4831573Srgrimes NEXT(); 48417141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 4851573Srgrimes SETERROR(REG_BADBR); 4861573Srgrimes } 4871573Srgrimes break; 4881573Srgrimes } 4891573Srgrimes 4901573Srgrimes if (!MORE()) 4911573Srgrimes return; 4921573Srgrimes c = PEEK(); 4931573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 49449094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 4951573Srgrimes return; 4961573Srgrimes SETERROR(REG_BADRPT); 4971573Srgrimes} 4981573Srgrimes 4991573Srgrimes/* 5001573Srgrimes - p_str - string (no metacharacters) "parser" 50192889Sobrien == static void p_str(struct parse *p); 5021573Srgrimes */ 5031573Srgrimesstatic void 5041573Srgrimesp_str(p) 50592889Sobrienstruct parse *p; 5061573Srgrimes{ 50717141Sjkh (void)REQUIRE(MORE(), REG_EMPTY); 5081573Srgrimes while (MORE()) 5091573Srgrimes ordinary(p, GETNEXT()); 5101573Srgrimes} 5111573Srgrimes 5121573Srgrimes/* 5131573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation 51492889Sobrien == static void p_bre(struct parse *p, int end1, \ 51592889Sobrien == int end2); 5161573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check. 5171573Srgrimes * 5181573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first 5191573Srgrimes * taken as an ordinary character and then revised to be an anchor. The 5201573Srgrimes * only undesirable side effect is that '$' gets included as a character 5211573Srgrimes * category in such cases. This is fairly harmless; not worth fixing. 5221573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive. 5231573Srgrimes */ 5241573Srgrimesstatic void 5251573Srgrimesp_bre(p, end1, end2) 52692889Sobrienstruct parse *p; 52792889Sobrienint end1; /* first terminating character */ 52892889Sobrienint end2; /* second terminating character */ 5291573Srgrimes{ 53092889Sobrien sopno start = HERE(); 53192889Sobrien int first = 1; /* first subexpression? */ 53292889Sobrien int wasdollar = 0; 5331573Srgrimes 5341573Srgrimes if (EAT('^')) { 5351573Srgrimes EMIT(OBOL, 0); 5361573Srgrimes p->g->iflags |= USEBOL; 5371573Srgrimes p->g->nbol++; 5381573Srgrimes } 5391573Srgrimes while (MORE() && !SEETWO(end1, end2)) { 5401573Srgrimes wasdollar = p_simp_re(p, first); 5411573Srgrimes first = 0; 5421573Srgrimes } 5431573Srgrimes if (wasdollar) { /* oops, that was a trailing anchor */ 5441573Srgrimes DROP(1); 5451573Srgrimes EMIT(OEOL, 0); 5461573Srgrimes p->g->iflags |= USEEOL; 5471573Srgrimes p->g->neol++; 5481573Srgrimes } 5491573Srgrimes 55017141Sjkh (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 5511573Srgrimes} 5521573Srgrimes 5531573Srgrimes/* 5541573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 55592889Sobrien == static int p_simp_re(struct parse *p, int starordinary); 5561573Srgrimes */ 5571573Srgrimesstatic int /* was the simple RE an unbackslashed $? */ 5581573Srgrimesp_simp_re(p, starordinary) 55992889Sobrienstruct parse *p; 5601573Srgrimesint starordinary; /* is a leading * an ordinary character? */ 5611573Srgrimes{ 56292889Sobrien int c; 56392889Sobrien int count; 56492889Sobrien int count2; 56592889Sobrien sopno pos; 56692889Sobrien int i; 56792889Sobrien sopno subno; 5681573Srgrimes# define BACKSL (1<<CHAR_BIT) 5691573Srgrimes 5701573Srgrimes pos = HERE(); /* repetion op, if any, covers from here */ 5711573Srgrimes 5721573Srgrimes assert(MORE()); /* caller should have ensured this */ 5731573Srgrimes c = GETNEXT(); 5741573Srgrimes if (c == '\\') { 57517141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 57649094Sache c = BACKSL | GETNEXT(); 5771573Srgrimes } 5781573Srgrimes switch (c) { 5791573Srgrimes case '.': 5801573Srgrimes if (p->g->cflags®_NEWLINE) 5811573Srgrimes nonnewline(p); 5821573Srgrimes else 5831573Srgrimes EMIT(OANY, 0); 5841573Srgrimes break; 5851573Srgrimes case '[': 5861573Srgrimes p_bracket(p); 5871573Srgrimes break; 5881573Srgrimes case BACKSL|'{': 5891573Srgrimes SETERROR(REG_BADRPT); 5901573Srgrimes break; 5911573Srgrimes case BACKSL|'(': 5921573Srgrimes p->g->nsub++; 5931573Srgrimes subno = p->g->nsub; 5941573Srgrimes if (subno < NPAREN) 5951573Srgrimes p->pbegin[subno] = HERE(); 5961573Srgrimes EMIT(OLPAREN, subno); 5971573Srgrimes /* the MORE here is an error heuristic */ 5981573Srgrimes if (MORE() && !SEETWO('\\', ')')) 5991573Srgrimes p_bre(p, '\\', ')'); 6001573Srgrimes if (subno < NPAREN) { 6011573Srgrimes p->pend[subno] = HERE(); 6021573Srgrimes assert(p->pend[subno] != 0); 6031573Srgrimes } 6041573Srgrimes EMIT(ORPAREN, subno); 60517141Sjkh (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 6061573Srgrimes break; 6071573Srgrimes case BACKSL|')': /* should not get here -- must be user */ 6081573Srgrimes case BACKSL|'}': 6091573Srgrimes SETERROR(REG_EPAREN); 6101573Srgrimes break; 6111573Srgrimes case BACKSL|'1': 6121573Srgrimes case BACKSL|'2': 6131573Srgrimes case BACKSL|'3': 6141573Srgrimes case BACKSL|'4': 6151573Srgrimes case BACKSL|'5': 6161573Srgrimes case BACKSL|'6': 6171573Srgrimes case BACKSL|'7': 6181573Srgrimes case BACKSL|'8': 6191573Srgrimes case BACKSL|'9': 6201573Srgrimes i = (c&~BACKSL) - '0'; 6211573Srgrimes assert(i < NPAREN); 6221573Srgrimes if (p->pend[i] != 0) { 6231573Srgrimes assert(i <= p->g->nsub); 6241573Srgrimes EMIT(OBACK_, i); 6251573Srgrimes assert(p->pbegin[i] != 0); 6261573Srgrimes assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 6271573Srgrimes assert(OP(p->strip[p->pend[i]]) == ORPAREN); 6281573Srgrimes (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 6291573Srgrimes EMIT(O_BACK, i); 6301573Srgrimes } else 6311573Srgrimes SETERROR(REG_ESUBREG); 6321573Srgrimes p->g->backrefs = 1; 6331573Srgrimes break; 6341573Srgrimes case '*': 63517141Sjkh (void)REQUIRE(starordinary, REG_BADRPT); 6361573Srgrimes /* FALLTHROUGH */ 6371573Srgrimes default: 63849094Sache ordinary(p, (char)c); 6391573Srgrimes break; 6401573Srgrimes } 6411573Srgrimes 6421573Srgrimes if (EAT('*')) { /* implemented as +? */ 6431573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 6441573Srgrimes INSERT(OPLUS_, pos); 6451573Srgrimes ASTERN(O_PLUS, pos); 6461573Srgrimes INSERT(OQUEST_, pos); 6471573Srgrimes ASTERN(O_QUEST, pos); 6481573Srgrimes } else if (EATTWO('\\', '{')) { 6491573Srgrimes count = p_count(p); 6501573Srgrimes if (EAT(',')) { 65149094Sache if (MORE() && isdigit((uch)PEEK())) { 6521573Srgrimes count2 = p_count(p); 65317141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 6541573Srgrimes } else /* single number with comma */ 6551573Srgrimes count2 = INFINITY; 6561573Srgrimes } else /* just a single number */ 6571573Srgrimes count2 = count; 6581573Srgrimes repeat(p, pos, count, count2); 6591573Srgrimes if (!EATTWO('\\', '}')) { /* error heuristics */ 6601573Srgrimes while (MORE() && !SEETWO('\\', '}')) 6611573Srgrimes NEXT(); 66217141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 6631573Srgrimes SETERROR(REG_BADBR); 6641573Srgrimes } 66549094Sache } else if (c == '$') /* $ (but not \$) ends it */ 6661573Srgrimes return(1); 6671573Srgrimes 6681573Srgrimes return(0); 6691573Srgrimes} 6701573Srgrimes 6711573Srgrimes/* 6721573Srgrimes - p_count - parse a repetition count 67392889Sobrien == static int p_count(struct parse *p); 6741573Srgrimes */ 6751573Srgrimesstatic int /* the value */ 6761573Srgrimesp_count(p) 67792889Sobrienstruct parse *p; 6781573Srgrimes{ 67992889Sobrien int count = 0; 68092889Sobrien int ndigits = 0; 6811573Srgrimes 68249094Sache while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 6831573Srgrimes count = count*10 + (GETNEXT() - '0'); 6841573Srgrimes ndigits++; 6851573Srgrimes } 6861573Srgrimes 68717141Sjkh (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 6881573Srgrimes return(count); 6891573Srgrimes} 6901573Srgrimes 6911573Srgrimes/* 6921573Srgrimes - p_bracket - parse a bracketed character list 69392889Sobrien == static void p_bracket(struct parse *p); 6941573Srgrimes * 6951573Srgrimes * Note a significant property of this code: if the allocset() did SETERROR, 6961573Srgrimes * no set operations are done. 6971573Srgrimes */ 6981573Srgrimesstatic void 6991573Srgrimesp_bracket(p) 70092889Sobrienstruct parse *p; 7011573Srgrimes{ 70292889Sobrien cset *cs = allocset(p); 70392889Sobrien int invert = 0; 7041573Srgrimes 7051573Srgrimes /* Dept of Truly Sickening Special-Case Kludges */ 7061573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 7071573Srgrimes EMIT(OBOW, 0); 7081573Srgrimes NEXTn(6); 7091573Srgrimes return; 7101573Srgrimes } 7111573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 7121573Srgrimes EMIT(OEOW, 0); 7131573Srgrimes NEXTn(6); 7141573Srgrimes return; 7151573Srgrimes } 7161573Srgrimes 7171573Srgrimes if (EAT('^')) 7181573Srgrimes invert++; /* make note to invert set at end */ 7191573Srgrimes if (EAT(']')) 7201573Srgrimes CHadd(cs, ']'); 7211573Srgrimes else if (EAT('-')) 7221573Srgrimes CHadd(cs, '-'); 7231573Srgrimes while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 7241573Srgrimes p_b_term(p, cs); 7251573Srgrimes if (EAT('-')) 7261573Srgrimes CHadd(cs, '-'); 72717141Sjkh (void)MUSTEAT(']', REG_EBRACK); 7281573Srgrimes 7291573Srgrimes if (p->error != 0) /* don't mess things up further */ 7301573Srgrimes return; 7311573Srgrimes 7321573Srgrimes if (p->g->cflags®_ICASE) { 73392889Sobrien int i; 73492889Sobrien int ci; 7351573Srgrimes 7361573Srgrimes for (i = p->g->csetsize - 1; i >= 0; i--) 7371573Srgrimes if (CHIN(cs, i) && isalpha(i)) { 7381573Srgrimes ci = othercase(i); 7391573Srgrimes if (ci != i) 7401573Srgrimes CHadd(cs, ci); 7411573Srgrimes } 7421573Srgrimes if (cs->multis != NULL) 7431573Srgrimes mccase(p, cs); 7441573Srgrimes } 7451573Srgrimes if (invert) { 74692889Sobrien int i; 7471573Srgrimes 7481573Srgrimes for (i = p->g->csetsize - 1; i >= 0; i--) 7491573Srgrimes if (CHIN(cs, i)) 7501573Srgrimes CHsub(cs, i); 7511573Srgrimes else 7521573Srgrimes CHadd(cs, i); 7531573Srgrimes if (p->g->cflags®_NEWLINE) 7541573Srgrimes CHsub(cs, '\n'); 7551573Srgrimes if (cs->multis != NULL) 7561573Srgrimes mcinvert(p, cs); 7571573Srgrimes } 7581573Srgrimes 7591573Srgrimes assert(cs->multis == NULL); /* xxx */ 7601573Srgrimes 7611573Srgrimes if (nch(p, cs) == 1) { /* optimize singleton sets */ 7621573Srgrimes ordinary(p, firstch(p, cs)); 7631573Srgrimes freeset(p, cs); 7641573Srgrimes } else 7651573Srgrimes EMIT(OANYOF, freezeset(p, cs)); 7661573Srgrimes} 7671573Srgrimes 7681573Srgrimes/* 7691573Srgrimes - p_b_term - parse one term of a bracketed character list 77092889Sobrien == static void p_b_term(struct parse *p, cset *cs); 7711573Srgrimes */ 7721573Srgrimesstatic void 7731573Srgrimesp_b_term(p, cs) 77492889Sobrienstruct parse *p; 77592889Sobriencset *cs; 7761573Srgrimes{ 77792889Sobrien char c; 77892889Sobrien char start, finish; 77992889Sobrien int i; 7801573Srgrimes 7811573Srgrimes /* classify what we've got */ 7821573Srgrimes switch ((MORE()) ? PEEK() : '\0') { 7831573Srgrimes case '[': 7841573Srgrimes c = (MORE2()) ? PEEK2() : '\0'; 7851573Srgrimes break; 7861573Srgrimes case '-': 7871573Srgrimes SETERROR(REG_ERANGE); 7881573Srgrimes return; /* NOTE RETURN */ 7891573Srgrimes break; 7901573Srgrimes default: 7911573Srgrimes c = '\0'; 7921573Srgrimes break; 7931573Srgrimes } 7941573Srgrimes 7951573Srgrimes switch (c) { 7961573Srgrimes case ':': /* character class */ 7971573Srgrimes NEXT2(); 79817141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 7991573Srgrimes c = PEEK(); 80017141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 8011573Srgrimes p_b_cclass(p, cs); 80217141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 80317141Sjkh (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 8041573Srgrimes break; 8051573Srgrimes case '=': /* equivalence class */ 8061573Srgrimes NEXT2(); 80717141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 8081573Srgrimes c = PEEK(); 80917141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 8101573Srgrimes p_b_eclass(p, cs); 81117141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 81217141Sjkh (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 8131573Srgrimes break; 8141573Srgrimes default: /* symbol, ordinary character, or range */ 8151573Srgrimes/* xxx revision needed for multichar stuff */ 8161573Srgrimes start = p_b_symbol(p); 8171573Srgrimes if (SEE('-') && MORE2() && PEEK2() != ']') { 8181573Srgrimes /* range */ 8191573Srgrimes NEXT(); 8201573Srgrimes if (EAT('-')) 8211573Srgrimes finish = '-'; 8221573Srgrimes else 8231573Srgrimes finish = p_b_symbol(p); 8241573Srgrimes } else 8251573Srgrimes finish = start; 82617514Sache if (start == finish) 82717514Sache CHadd(cs, start); 82817514Sache else { 82924637Sache if (__collate_load_error) { 83024637Sache (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 83124637Sache for (i = (uch)start; i <= (uch)finish; i++) 83217514Sache CHadd(cs, i); 83324637Sache } else { 83424637Sache (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 83524637Sache for (i = CHAR_MIN; i <= CHAR_MAX; i++) { 83624637Sache if ( __collate_range_cmp(start, i) <= 0 83724637Sache && __collate_range_cmp(i, finish) <= 0 83824637Sache ) 83924637Sache CHadd(cs, i); 84024637Sache } 84117514Sache } 84217514Sache } 8431573Srgrimes break; 8441573Srgrimes } 8451573Srgrimes} 8461573Srgrimes 8471573Srgrimes/* 8481573Srgrimes - p_b_cclass - parse a character-class name and deal with it 84992889Sobrien == static void p_b_cclass(struct parse *p, cset *cs); 8501573Srgrimes */ 8511573Srgrimesstatic void 8521573Srgrimesp_b_cclass(p, cs) 85392889Sobrienstruct parse *p; 85492889Sobriencset *cs; 8551573Srgrimes{ 85692889Sobrien int c; 85792889Sobrien char *sp = p->next; 85892889Sobrien struct cclass *cp; 85992889Sobrien size_t len; 8601573Srgrimes 86117508Sache while (MORE() && isalpha((uch)PEEK())) 8621573Srgrimes NEXT(); 8631573Srgrimes len = p->next - sp; 8641573Srgrimes for (cp = cclasses; cp->name != NULL; cp++) 8651573Srgrimes if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 8661573Srgrimes break; 8671573Srgrimes if (cp->name == NULL) { 8681573Srgrimes /* oops, didn't find it */ 8691573Srgrimes SETERROR(REG_ECTYPE); 8701573Srgrimes return; 8711573Srgrimes } 8721573Srgrimes 87317508Sache switch (cp->fidx) { 87417508Sache case CALNUM: 87517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 87617508Sache if (isalnum((uch)c)) 87717508Sache CHadd(cs, c); 87817508Sache break; 87917508Sache case CALPHA: 88017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 88117508Sache if (isalpha((uch)c)) 88217508Sache CHadd(cs, c); 88317508Sache break; 88417508Sache case CBLANK: 88517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 88617508Sache if (isblank((uch)c)) 88717508Sache CHadd(cs, c); 88817508Sache break; 88917508Sache case CCNTRL: 89017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 89117508Sache if (iscntrl((uch)c)) 89217508Sache CHadd(cs, c); 89317508Sache break; 89417508Sache case CDIGIT: 89517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 89617508Sache if (isdigit((uch)c)) 89717508Sache CHadd(cs, c); 89817508Sache break; 89917508Sache case CGRAPH: 90017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 90117508Sache if (isgraph((uch)c)) 90217508Sache CHadd(cs, c); 90317508Sache break; 90417508Sache case CLOWER: 90517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 90617508Sache if (islower((uch)c)) 90717508Sache CHadd(cs, c); 90817508Sache break; 90917508Sache case CPRINT: 91017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 91117508Sache if (isprint((uch)c)) 91217508Sache CHadd(cs, c); 91317508Sache break; 91417508Sache case CPUNCT: 91517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 91617508Sache if (ispunct((uch)c)) 91717508Sache CHadd(cs, c); 91817508Sache break; 91917508Sache case CSPACE: 92017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 92117508Sache if (isspace((uch)c)) 92217508Sache CHadd(cs, c); 92317508Sache break; 92417508Sache case CUPPER: 92517508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 92617508Sache if (isupper((uch)c)) 92717508Sache CHadd(cs, c); 92817508Sache break; 92917508Sache case CXDIGIT: 93017508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 93117508Sache if (isxdigit((uch)c)) 93217508Sache CHadd(cs, c); 93317508Sache break; 93417508Sache } 93517508Sache#if 0 9361573Srgrimes for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 9371573Srgrimes MCadd(p, cs, u); 93817508Sache#endif 9391573Srgrimes} 9401573Srgrimes 9411573Srgrimes/* 9421573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it 94392889Sobrien == static void p_b_eclass(struct parse *p, cset *cs); 9441573Srgrimes * 9451573Srgrimes * This implementation is incomplete. xxx 9461573Srgrimes */ 9471573Srgrimesstatic void 9481573Srgrimesp_b_eclass(p, cs) 94992889Sobrienstruct parse *p; 95092889Sobriencset *cs; 9511573Srgrimes{ 95292889Sobrien char c; 9531573Srgrimes 9541573Srgrimes c = p_b_coll_elem(p, '='); 9551573Srgrimes CHadd(cs, c); 9561573Srgrimes} 9571573Srgrimes 9581573Srgrimes/* 9591573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 96092889Sobrien == static char p_b_symbol(struct parse *p); 9611573Srgrimes */ 9621573Srgrimesstatic char /* value of symbol */ 9631573Srgrimesp_b_symbol(p) 96492889Sobrienstruct parse *p; 9651573Srgrimes{ 96692889Sobrien char value; 9671573Srgrimes 96817141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 9691573Srgrimes if (!EATTWO('[', '.')) 9701573Srgrimes return(GETNEXT()); 9711573Srgrimes 9721573Srgrimes /* collating symbol */ 9731573Srgrimes value = p_b_coll_elem(p, '.'); 97417141Sjkh (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 9751573Srgrimes return(value); 9761573Srgrimes} 9771573Srgrimes 9781573Srgrimes/* 9791573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up 98092889Sobrien == static char p_b_coll_elem(struct parse *p, int endc); 9811573Srgrimes */ 9821573Srgrimesstatic char /* value of collating element */ 9831573Srgrimesp_b_coll_elem(p, endc) 98492889Sobrienstruct parse *p; 9851573Srgrimesint endc; /* name ended by endc,']' */ 9861573Srgrimes{ 98792889Sobrien char *sp = p->next; 98892889Sobrien struct cname *cp; 98992889Sobrien int len; 9901573Srgrimes 9911573Srgrimes while (MORE() && !SEETWO(endc, ']')) 9921573Srgrimes NEXT(); 9931573Srgrimes if (!MORE()) { 9941573Srgrimes SETERROR(REG_EBRACK); 9951573Srgrimes return(0); 9961573Srgrimes } 9971573Srgrimes len = p->next - sp; 9981573Srgrimes for (cp = cnames; cp->name != NULL; cp++) 9991573Srgrimes if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 10001573Srgrimes return(cp->code); /* known name */ 10011573Srgrimes if (len == 1) 10021573Srgrimes return(*sp); /* single character */ 10031573Srgrimes SETERROR(REG_ECOLLATE); /* neither */ 10041573Srgrimes return(0); 10051573Srgrimes} 10061573Srgrimes 10071573Srgrimes/* 10081573Srgrimes - othercase - return the case counterpart of an alphabetic 10091573Srgrimes == static char othercase(int ch); 10101573Srgrimes */ 10111573Srgrimesstatic char /* if no counterpart, return ch */ 10121573Srgrimesothercase(ch) 10131573Srgrimesint ch; 10141573Srgrimes{ 101549094Sache ch = (uch)ch; 10161573Srgrimes assert(isalpha(ch)); 10171573Srgrimes if (isupper(ch)) 10181573Srgrimes return(tolower(ch)); 10191573Srgrimes else if (islower(ch)) 10201573Srgrimes return(toupper(ch)); 10211573Srgrimes else /* peculiar, but could happen */ 10221573Srgrimes return(ch); 10231573Srgrimes} 10241573Srgrimes 10251573Srgrimes/* 10261573Srgrimes - bothcases - emit a dualcase version of a two-case character 102792889Sobrien == static void bothcases(struct parse *p, int ch); 10281573Srgrimes * 10291573Srgrimes * Boy, is this implementation ever a kludge... 10301573Srgrimes */ 10311573Srgrimesstatic void 10321573Srgrimesbothcases(p, ch) 103392889Sobrienstruct parse *p; 10341573Srgrimesint ch; 10351573Srgrimes{ 103692889Sobrien char *oldnext = p->next; 103792889Sobrien char *oldend = p->end; 10381573Srgrimes char bracket[3]; 10391573Srgrimes 104049094Sache ch = (uch)ch; 10411573Srgrimes assert(othercase(ch) != ch); /* p_bracket() would recurse */ 10421573Srgrimes p->next = bracket; 10431573Srgrimes p->end = bracket+2; 10441573Srgrimes bracket[0] = ch; 10451573Srgrimes bracket[1] = ']'; 10461573Srgrimes bracket[2] = '\0'; 10471573Srgrimes p_bracket(p); 10481573Srgrimes assert(p->next == bracket+2); 10491573Srgrimes p->next = oldnext; 10501573Srgrimes p->end = oldend; 10511573Srgrimes} 10521573Srgrimes 10531573Srgrimes/* 10541573Srgrimes - ordinary - emit an ordinary character 105592889Sobrien == static void ordinary(struct parse *p, int ch); 10561573Srgrimes */ 10571573Srgrimesstatic void 10581573Srgrimesordinary(p, ch) 105992889Sobrienstruct parse *p; 106092889Sobrienint ch; 10611573Srgrimes{ 106292889Sobrien cat_t *cap = p->g->categories; 10631573Srgrimes 106449094Sache if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 10651573Srgrimes bothcases(p, ch); 10661573Srgrimes else { 106749094Sache EMIT(OCHAR, (uch)ch); 10681573Srgrimes if (cap[ch] == 0) 10691573Srgrimes cap[ch] = p->g->ncategories++; 10701573Srgrimes } 10711573Srgrimes} 10721573Srgrimes 10731573Srgrimes/* 10741573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY 107592889Sobrien == static void nonnewline(struct parse *p); 10761573Srgrimes * 10771573Srgrimes * Boy, is this implementation ever a kludge... 10781573Srgrimes */ 10791573Srgrimesstatic void 10801573Srgrimesnonnewline(p) 108192889Sobrienstruct parse *p; 10821573Srgrimes{ 108392889Sobrien char *oldnext = p->next; 108492889Sobrien char *oldend = p->end; 10851573Srgrimes char bracket[4]; 10861573Srgrimes 10871573Srgrimes p->next = bracket; 10881573Srgrimes p->end = bracket+3; 10891573Srgrimes bracket[0] = '^'; 10901573Srgrimes bracket[1] = '\n'; 10911573Srgrimes bracket[2] = ']'; 10921573Srgrimes bracket[3] = '\0'; 10931573Srgrimes p_bracket(p); 10941573Srgrimes assert(p->next == bracket+3); 10951573Srgrimes p->next = oldnext; 10961573Srgrimes p->end = oldend; 10971573Srgrimes} 10981573Srgrimes 10991573Srgrimes/* 11001573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed 110192889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to); 11021573Srgrimes */ 11031573Srgrimesstatic void 11041573Srgrimesrepeat(p, start, from, to) 110592889Sobrienstruct parse *p; 11061573Srgrimessopno start; /* operand from here to end of strip */ 11071573Srgrimesint from; /* repeated from this number */ 11081573Srgrimesint to; /* to this number of times (maybe INFINITY) */ 11091573Srgrimes{ 111092889Sobrien sopno finish = HERE(); 11111573Srgrimes# define N 2 11121573Srgrimes# define INF 3 11131573Srgrimes# define REP(f, t) ((f)*8 + (t)) 11141573Srgrimes# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 111592889Sobrien sopno copy; 11161573Srgrimes 11171573Srgrimes if (p->error != 0) /* head off possible runaway recursion */ 11181573Srgrimes return; 11191573Srgrimes 11201573Srgrimes assert(from <= to); 11211573Srgrimes 11221573Srgrimes switch (REP(MAP(from), MAP(to))) { 11231573Srgrimes case REP(0, 0): /* must be user doing this */ 11241573Srgrimes DROP(finish-start); /* drop the operand */ 11251573Srgrimes break; 11261573Srgrimes case REP(0, 1): /* as x{1,1}? */ 11271573Srgrimes case REP(0, N): /* as x{1,n}? */ 11281573Srgrimes case REP(0, INF): /* as x{1,}? */ 11291573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 11301573Srgrimes INSERT(OCH_, start); /* offset is wrong... */ 11311573Srgrimes repeat(p, start+1, 1, to); 11321573Srgrimes ASTERN(OOR1, start); 11331573Srgrimes AHEAD(start); /* ... fix it */ 11341573Srgrimes EMIT(OOR2, 0); 11351573Srgrimes AHEAD(THERE()); 11361573Srgrimes ASTERN(O_CH, THERETHERE()); 11371573Srgrimes break; 11381573Srgrimes case REP(1, 1): /* trivial case */ 11391573Srgrimes /* done */ 11401573Srgrimes break; 11411573Srgrimes case REP(1, N): /* as x?x{1,n-1} */ 11421573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 11431573Srgrimes INSERT(OCH_, start); 11441573Srgrimes ASTERN(OOR1, start); 11451573Srgrimes AHEAD(start); 11461573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 11471573Srgrimes AHEAD(THERE()); /* ...so fix it */ 11481573Srgrimes ASTERN(O_CH, THERETHERE()); 11491573Srgrimes copy = dupl(p, start+1, finish+1); 11501573Srgrimes assert(copy == finish+4); 11511573Srgrimes repeat(p, copy, 1, to-1); 11521573Srgrimes break; 11531573Srgrimes case REP(1, INF): /* as x+ */ 11541573Srgrimes INSERT(OPLUS_, start); 11551573Srgrimes ASTERN(O_PLUS, start); 11561573Srgrimes break; 11571573Srgrimes case REP(N, N): /* as xx{m-1,n-1} */ 11581573Srgrimes copy = dupl(p, start, finish); 11591573Srgrimes repeat(p, copy, from-1, to-1); 11601573Srgrimes break; 11611573Srgrimes case REP(N, INF): /* as xx{n-1,INF} */ 11621573Srgrimes copy = dupl(p, start, finish); 11631573Srgrimes repeat(p, copy, from-1, to); 11641573Srgrimes break; 11651573Srgrimes default: /* "can't happen" */ 11661573Srgrimes SETERROR(REG_ASSERT); /* just in case */ 11671573Srgrimes break; 11681573Srgrimes } 11691573Srgrimes} 11701573Srgrimes 11711573Srgrimes/* 11721573Srgrimes - seterr - set an error condition 117392889Sobrien == static int seterr(struct parse *p, int e); 11741573Srgrimes */ 11751573Srgrimesstatic int /* useless but makes type checking happy */ 11761573Srgrimesseterr(p, e) 117792889Sobrienstruct parse *p; 11781573Srgrimesint e; 11791573Srgrimes{ 11801573Srgrimes if (p->error == 0) /* keep earliest error condition */ 11811573Srgrimes p->error = e; 11821573Srgrimes p->next = nuls; /* try to bring things to a halt */ 11831573Srgrimes p->end = nuls; 11841573Srgrimes return(0); /* make the return value well-defined */ 11851573Srgrimes} 11861573Srgrimes 11871573Srgrimes/* 11881573Srgrimes - allocset - allocate a set of characters for [] 118992889Sobrien == static cset *allocset(struct parse *p); 11901573Srgrimes */ 11911573Srgrimesstatic cset * 11921573Srgrimesallocset(p) 119392889Sobrienstruct parse *p; 11941573Srgrimes{ 119592889Sobrien int no = p->g->ncsets++; 119692889Sobrien size_t nc; 119792889Sobrien size_t nbytes; 119892889Sobrien cset *cs; 119992889Sobrien size_t css = (size_t)p->g->csetsize; 120092889Sobrien int i; 12011573Srgrimes 12021573Srgrimes if (no >= p->ncsalloc) { /* need another column of space */ 12031573Srgrimes p->ncsalloc += CHAR_BIT; 12041573Srgrimes nc = p->ncsalloc; 12051573Srgrimes assert(nc % CHAR_BIT == 0); 12061573Srgrimes nbytes = nc / CHAR_BIT * css; 12071573Srgrimes if (p->g->sets == NULL) 12081573Srgrimes p->g->sets = (cset *)malloc(nc * sizeof(cset)); 12091573Srgrimes else 121039327Simp p->g->sets = (cset *)reallocf((char *)p->g->sets, 12111573Srgrimes nc * sizeof(cset)); 12121573Srgrimes if (p->g->setbits == NULL) 12131573Srgrimes p->g->setbits = (uch *)malloc(nbytes); 12141573Srgrimes else { 121539327Simp p->g->setbits = (uch *)reallocf((char *)p->g->setbits, 12161573Srgrimes nbytes); 12171573Srgrimes /* xxx this isn't right if setbits is now NULL */ 12181573Srgrimes for (i = 0; i < no; i++) 12191573Srgrimes p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 12201573Srgrimes } 12211573Srgrimes if (p->g->sets != NULL && p->g->setbits != NULL) 12221573Srgrimes (void) memset((char *)p->g->setbits + (nbytes - css), 12231573Srgrimes 0, css); 12241573Srgrimes else { 12251573Srgrimes no = 0; 12261573Srgrimes SETERROR(REG_ESPACE); 12271573Srgrimes /* caller's responsibility not to do set ops */ 12281573Srgrimes } 12291573Srgrimes } 12301573Srgrimes 12311573Srgrimes assert(p->g->sets != NULL); /* xxx */ 12321573Srgrimes cs = &p->g->sets[no]; 12331573Srgrimes cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 12341573Srgrimes cs->mask = 1 << ((no) % CHAR_BIT); 12351573Srgrimes cs->hash = 0; 12361573Srgrimes cs->smultis = 0; 12371573Srgrimes cs->multis = NULL; 12381573Srgrimes 12391573Srgrimes return(cs); 12401573Srgrimes} 12411573Srgrimes 12421573Srgrimes/* 12431573Srgrimes - freeset - free a now-unused set 124492889Sobrien == static void freeset(struct parse *p, cset *cs); 12451573Srgrimes */ 12461573Srgrimesstatic void 12471573Srgrimesfreeset(p, cs) 124892889Sobrienstruct parse *p; 124992889Sobriencset *cs; 12501573Srgrimes{ 125192889Sobrien int i; 125292889Sobrien cset *top = &p->g->sets[p->g->ncsets]; 125392889Sobrien size_t css = (size_t)p->g->csetsize; 12541573Srgrimes 12551573Srgrimes for (i = 0; i < css; i++) 12561573Srgrimes CHsub(cs, i); 12571573Srgrimes if (cs == top-1) /* recover only the easy case */ 12581573Srgrimes p->g->ncsets--; 12591573Srgrimes} 12601573Srgrimes 12611573Srgrimes/* 12621573Srgrimes - freezeset - final processing on a set of characters 126392889Sobrien == static int freezeset(struct parse *p, cset *cs); 12641573Srgrimes * 12651573Srgrimes * The main task here is merging identical sets. This is usually a waste 12661573Srgrimes * of time (although the hash code minimizes the overhead), but can win 12671573Srgrimes * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 12681573Srgrimes * is done using addition rather than xor -- all ASCII [aA] sets xor to 12691573Srgrimes * the same value! 12701573Srgrimes */ 12711573Srgrimesstatic int /* set number */ 12721573Srgrimesfreezeset(p, cs) 127392889Sobrienstruct parse *p; 127492889Sobriencset *cs; 12751573Srgrimes{ 127692889Sobrien short h = cs->hash; 127792889Sobrien int i; 127892889Sobrien cset *top = &p->g->sets[p->g->ncsets]; 127992889Sobrien cset *cs2; 128092889Sobrien size_t css = (size_t)p->g->csetsize; 12811573Srgrimes 12821573Srgrimes /* look for an earlier one which is the same */ 12831573Srgrimes for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 12841573Srgrimes if (cs2->hash == h && cs2 != cs) { 12851573Srgrimes /* maybe */ 12861573Srgrimes for (i = 0; i < css; i++) 12871573Srgrimes if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 12881573Srgrimes break; /* no */ 12891573Srgrimes if (i == css) 12901573Srgrimes break; /* yes */ 12911573Srgrimes } 12921573Srgrimes 12931573Srgrimes if (cs2 < top) { /* found one */ 12941573Srgrimes freeset(p, cs); 12951573Srgrimes cs = cs2; 12961573Srgrimes } 12971573Srgrimes 12981573Srgrimes return((int)(cs - p->g->sets)); 12991573Srgrimes} 13001573Srgrimes 13011573Srgrimes/* 13021573Srgrimes - firstch - return first character in a set (which must have at least one) 130392889Sobrien == static int firstch(struct parse *p, cset *cs); 13041573Srgrimes */ 13051573Srgrimesstatic int /* character; there is no "none" value */ 13061573Srgrimesfirstch(p, cs) 130792889Sobrienstruct parse *p; 130892889Sobriencset *cs; 13091573Srgrimes{ 131092889Sobrien int i; 131192889Sobrien size_t css = (size_t)p->g->csetsize; 13121573Srgrimes 13131573Srgrimes for (i = 0; i < css; i++) 13141573Srgrimes if (CHIN(cs, i)) 131549094Sache return((char)i); 13161573Srgrimes assert(never); 13171573Srgrimes return(0); /* arbitrary */ 13181573Srgrimes} 13191573Srgrimes 13201573Srgrimes/* 13211573Srgrimes - nch - number of characters in a set 132292889Sobrien == static int nch(struct parse *p, cset *cs); 13231573Srgrimes */ 13241573Srgrimesstatic int 13251573Srgrimesnch(p, cs) 132692889Sobrienstruct parse *p; 132792889Sobriencset *cs; 13281573Srgrimes{ 132992889Sobrien int i; 133092889Sobrien size_t css = (size_t)p->g->csetsize; 133192889Sobrien int n = 0; 13321573Srgrimes 13331573Srgrimes for (i = 0; i < css; i++) 13341573Srgrimes if (CHIN(cs, i)) 13351573Srgrimes n++; 13361573Srgrimes return(n); 13371573Srgrimes} 13381573Srgrimes 13391573Srgrimes/* 13401573Srgrimes - mcadd - add a collating element to a cset 134192889Sobrien == static void mcadd(struct parse *p, cset *cs, \ 134292889Sobrien == char *cp); 13431573Srgrimes */ 13441573Srgrimesstatic void 13451573Srgrimesmcadd(p, cs, cp) 134692889Sobrienstruct parse *p; 134792889Sobriencset *cs; 134892889Sobrienchar *cp; 13491573Srgrimes{ 135092889Sobrien size_t oldend = cs->smultis; 13511573Srgrimes 13521573Srgrimes cs->smultis += strlen(cp) + 1; 13531573Srgrimes if (cs->multis == NULL) 13541573Srgrimes cs->multis = malloc(cs->smultis); 13551573Srgrimes else 135639327Simp cs->multis = reallocf(cs->multis, cs->smultis); 13571573Srgrimes if (cs->multis == NULL) { 13581573Srgrimes SETERROR(REG_ESPACE); 13591573Srgrimes return; 13601573Srgrimes } 13611573Srgrimes 13621573Srgrimes (void) strcpy(cs->multis + oldend - 1, cp); 13631573Srgrimes cs->multis[cs->smultis - 1] = '\0'; 13641573Srgrimes} 13651573Srgrimes 136617141Sjkh#if used 13671573Srgrimes/* 13681573Srgrimes - mcsub - subtract a collating element from a cset 136992889Sobrien == static void mcsub(cset *cs, char *cp); 13701573Srgrimes */ 13711573Srgrimesstatic void 13721573Srgrimesmcsub(cs, cp) 137392889Sobriencset *cs; 137492889Sobrienchar *cp; 13751573Srgrimes{ 137692889Sobrien char *fp = mcfind(cs, cp); 137792889Sobrien size_t len = strlen(fp); 13781573Srgrimes 13791573Srgrimes assert(fp != NULL); 13801573Srgrimes (void) memmove(fp, fp + len + 1, 13811573Srgrimes cs->smultis - (fp + len + 1 - cs->multis)); 13821573Srgrimes cs->smultis -= len; 13831573Srgrimes 13841573Srgrimes if (cs->smultis == 0) { 13851573Srgrimes free(cs->multis); 13861573Srgrimes cs->multis = NULL; 13871573Srgrimes return; 13881573Srgrimes } 13891573Srgrimes 139039327Simp cs->multis = reallocf(cs->multis, cs->smultis); 13911573Srgrimes assert(cs->multis != NULL); 13921573Srgrimes} 13931573Srgrimes 13941573Srgrimes/* 13951573Srgrimes - mcin - is a collating element in a cset? 139692889Sobrien == static int mcin(cset *cs, char *cp); 13971573Srgrimes */ 13981573Srgrimesstatic int 13991573Srgrimesmcin(cs, cp) 140092889Sobriencset *cs; 140192889Sobrienchar *cp; 14021573Srgrimes{ 14031573Srgrimes return(mcfind(cs, cp) != NULL); 14041573Srgrimes} 14051573Srgrimes 14061573Srgrimes/* 14071573Srgrimes - mcfind - find a collating element in a cset 140892889Sobrien == static char *mcfind(cset *cs, char *cp); 14091573Srgrimes */ 14101573Srgrimesstatic char * 14111573Srgrimesmcfind(cs, cp) 141292889Sobriencset *cs; 141392889Sobrienchar *cp; 14141573Srgrimes{ 141592889Sobrien char *p; 14161573Srgrimes 14171573Srgrimes if (cs->multis == NULL) 14181573Srgrimes return(NULL); 14191573Srgrimes for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 14201573Srgrimes if (strcmp(cp, p) == 0) 14211573Srgrimes return(p); 14221573Srgrimes return(NULL); 14231573Srgrimes} 142417141Sjkh#endif 14251573Srgrimes 14261573Srgrimes/* 14271573Srgrimes - mcinvert - invert the list of collating elements in a cset 142892889Sobrien == static void mcinvert(struct parse *p, cset *cs); 14291573Srgrimes * 14301573Srgrimes * This would have to know the set of possibilities. Implementation 14311573Srgrimes * is deferred. 14321573Srgrimes */ 14331573Srgrimesstatic void 14341573Srgrimesmcinvert(p, cs) 143592889Sobrienstruct parse *p; 143692889Sobriencset *cs; 14371573Srgrimes{ 14381573Srgrimes assert(cs->multis == NULL); /* xxx */ 14391573Srgrimes} 14401573Srgrimes 14411573Srgrimes/* 14421573Srgrimes - mccase - add case counterparts of the list of collating elements in a cset 144392889Sobrien == static void mccase(struct parse *p, cset *cs); 14441573Srgrimes * 14451573Srgrimes * This would have to know the set of possibilities. Implementation 14461573Srgrimes * is deferred. 14471573Srgrimes */ 14481573Srgrimesstatic void 14491573Srgrimesmccase(p, cs) 145092889Sobrienstruct parse *p; 145192889Sobriencset *cs; 14521573Srgrimes{ 14531573Srgrimes assert(cs->multis == NULL); /* xxx */ 14541573Srgrimes} 14551573Srgrimes 14561573Srgrimes/* 14571573Srgrimes - isinsets - is this character in any sets? 145892889Sobrien == static int isinsets(struct re_guts *g, int c); 14591573Srgrimes */ 14601573Srgrimesstatic int /* predicate */ 14611573Srgrimesisinsets(g, c) 146292889Sobrienstruct re_guts *g; 14631573Srgrimesint c; 14641573Srgrimes{ 146592889Sobrien uch *col; 146692889Sobrien int i; 146792889Sobrien int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 146892889Sobrien unsigned uc = (uch)c; 14691573Srgrimes 14701573Srgrimes for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 14711573Srgrimes if (col[uc] != 0) 14721573Srgrimes return(1); 14731573Srgrimes return(0); 14741573Srgrimes} 14751573Srgrimes 14761573Srgrimes/* 14771573Srgrimes - samesets - are these two characters in exactly the same sets? 147892889Sobrien == static int samesets(struct re_guts *g, int c1, int c2); 14791573Srgrimes */ 14801573Srgrimesstatic int /* predicate */ 14811573Srgrimessamesets(g, c1, c2) 148292889Sobrienstruct re_guts *g; 14831573Srgrimesint c1; 14841573Srgrimesint c2; 14851573Srgrimes{ 148692889Sobrien uch *col; 148792889Sobrien int i; 148892889Sobrien int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 148992889Sobrien unsigned uc1 = (uch)c1; 149092889Sobrien unsigned uc2 = (uch)c2; 14911573Srgrimes 14921573Srgrimes for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 14931573Srgrimes if (col[uc1] != col[uc2]) 14941573Srgrimes return(0); 14951573Srgrimes return(1); 14961573Srgrimes} 14971573Srgrimes 14981573Srgrimes/* 14991573Srgrimes - categorize - sort out character categories 150092889Sobrien == static void categorize(struct parse *p, struct re_guts *g); 15011573Srgrimes */ 15021573Srgrimesstatic void 15031573Srgrimescategorize(p, g) 15041573Srgrimesstruct parse *p; 150592889Sobrienstruct re_guts *g; 15061573Srgrimes{ 150792889Sobrien cat_t *cats = g->categories; 150892889Sobrien int c; 150992889Sobrien int c2; 151092889Sobrien cat_t cat; 15111573Srgrimes 15121573Srgrimes /* avoid making error situations worse */ 15131573Srgrimes if (p->error != 0) 15141573Srgrimes return; 15151573Srgrimes 15161573Srgrimes for (c = CHAR_MIN; c <= CHAR_MAX; c++) 15171573Srgrimes if (cats[c] == 0 && isinsets(g, c)) { 15181573Srgrimes cat = g->ncategories++; 15191573Srgrimes cats[c] = cat; 15201573Srgrimes for (c2 = c+1; c2 <= CHAR_MAX; c2++) 15211573Srgrimes if (cats[c2] == 0 && samesets(g, c, c2)) 15221573Srgrimes cats[c2] = cat; 15231573Srgrimes } 15241573Srgrimes} 15251573Srgrimes 15261573Srgrimes/* 15271573Srgrimes - dupl - emit a duplicate of a bunch of sops 152892889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish); 15291573Srgrimes */ 15301573Srgrimesstatic sopno /* start of duplicate */ 15311573Srgrimesdupl(p, start, finish) 153292889Sobrienstruct parse *p; 15331573Srgrimessopno start; /* from here */ 15341573Srgrimessopno finish; /* to this less one */ 15351573Srgrimes{ 153692889Sobrien sopno ret = HERE(); 153792889Sobrien sopno len = finish - start; 15381573Srgrimes 15391573Srgrimes assert(finish >= start); 15401573Srgrimes if (len == 0) 15411573Srgrimes return(ret); 15421573Srgrimes enlarge(p, p->ssize + len); /* this many unexpected additions */ 15431573Srgrimes assert(p->ssize >= p->slen + len); 15441573Srgrimes (void) memcpy((char *)(p->strip + p->slen), 15451573Srgrimes (char *)(p->strip + start), (size_t)len*sizeof(sop)); 15461573Srgrimes p->slen += len; 15471573Srgrimes return(ret); 15481573Srgrimes} 15491573Srgrimes 15501573Srgrimes/* 15511573Srgrimes - doemit - emit a strip operator 155292889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd); 15531573Srgrimes * 15541573Srgrimes * It might seem better to implement this as a macro with a function as 15551573Srgrimes * hard-case backup, but it's just too big and messy unless there are 15561573Srgrimes * some changes to the data structures. Maybe later. 15571573Srgrimes */ 15581573Srgrimesstatic void 15591573Srgrimesdoemit(p, op, opnd) 156092889Sobrienstruct parse *p; 15611573Srgrimessop op; 15621573Srgrimessize_t opnd; 15631573Srgrimes{ 15641573Srgrimes /* avoid making error situations worse */ 15651573Srgrimes if (p->error != 0) 15661573Srgrimes return; 15671573Srgrimes 15681573Srgrimes /* deal with oversize operands ("can't happen", more or less) */ 15691573Srgrimes assert(opnd < 1<<OPSHIFT); 15701573Srgrimes 15711573Srgrimes /* deal with undersized strip */ 15721573Srgrimes if (p->slen >= p->ssize) 15731573Srgrimes enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 15741573Srgrimes assert(p->slen < p->ssize); 15751573Srgrimes 15761573Srgrimes /* finally, it's all reduced to the easy case */ 15771573Srgrimes p->strip[p->slen++] = SOP(op, opnd); 15781573Srgrimes} 15791573Srgrimes 15801573Srgrimes/* 15811573Srgrimes - doinsert - insert a sop into the strip 158292889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 15831573Srgrimes */ 15841573Srgrimesstatic void 15851573Srgrimesdoinsert(p, op, opnd, pos) 158692889Sobrienstruct parse *p; 15871573Srgrimessop op; 15881573Srgrimessize_t opnd; 15891573Srgrimessopno pos; 15901573Srgrimes{ 159192889Sobrien sopno sn; 159292889Sobrien sop s; 159392889Sobrien int i; 15941573Srgrimes 15951573Srgrimes /* avoid making error situations worse */ 15961573Srgrimes if (p->error != 0) 15971573Srgrimes return; 15981573Srgrimes 15991573Srgrimes sn = HERE(); 16001573Srgrimes EMIT(op, opnd); /* do checks, ensure space */ 16011573Srgrimes assert(HERE() == sn+1); 16021573Srgrimes s = p->strip[sn]; 16031573Srgrimes 16041573Srgrimes /* adjust paren pointers */ 16051573Srgrimes assert(pos > 0); 16061573Srgrimes for (i = 1; i < NPAREN; i++) { 16071573Srgrimes if (p->pbegin[i] >= pos) { 16081573Srgrimes p->pbegin[i]++; 16091573Srgrimes } 16101573Srgrimes if (p->pend[i] >= pos) { 16111573Srgrimes p->pend[i]++; 16121573Srgrimes } 16131573Srgrimes } 16141573Srgrimes 16151573Srgrimes memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 16161573Srgrimes (HERE()-pos-1)*sizeof(sop)); 16171573Srgrimes p->strip[pos] = s; 16181573Srgrimes} 16191573Srgrimes 16201573Srgrimes/* 16211573Srgrimes - dofwd - complete a forward reference 162292889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value); 16231573Srgrimes */ 16241573Srgrimesstatic void 16251573Srgrimesdofwd(p, pos, value) 162692889Sobrienstruct parse *p; 162792889Sobriensopno pos; 16281573Srgrimessop value; 16291573Srgrimes{ 16301573Srgrimes /* avoid making error situations worse */ 16311573Srgrimes if (p->error != 0) 16321573Srgrimes return; 16331573Srgrimes 16341573Srgrimes assert(value < 1<<OPSHIFT); 16351573Srgrimes p->strip[pos] = OP(p->strip[pos]) | value; 16361573Srgrimes} 16371573Srgrimes 16381573Srgrimes/* 16391573Srgrimes - enlarge - enlarge the strip 164092889Sobrien == static void enlarge(struct parse *p, sopno size); 16411573Srgrimes */ 16421573Srgrimesstatic void 16431573Srgrimesenlarge(p, size) 164492889Sobrienstruct parse *p; 164592889Sobriensopno size; 16461573Srgrimes{ 164792889Sobrien sop *sp; 16481573Srgrimes 16491573Srgrimes if (p->ssize >= size) 16501573Srgrimes return; 16511573Srgrimes 16521573Srgrimes sp = (sop *)realloc(p->strip, size*sizeof(sop)); 16531573Srgrimes if (sp == NULL) { 16541573Srgrimes SETERROR(REG_ESPACE); 16551573Srgrimes return; 16561573Srgrimes } 16571573Srgrimes p->strip = sp; 16581573Srgrimes p->ssize = size; 16591573Srgrimes} 16601573Srgrimes 16611573Srgrimes/* 16621573Srgrimes - stripsnug - compact the strip 166392889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g); 16641573Srgrimes */ 16651573Srgrimesstatic void 16661573Srgrimesstripsnug(p, g) 166792889Sobrienstruct parse *p; 166892889Sobrienstruct re_guts *g; 16691573Srgrimes{ 16701573Srgrimes g->nstates = p->slen; 16711573Srgrimes g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 16721573Srgrimes if (g->strip == NULL) { 16731573Srgrimes SETERROR(REG_ESPACE); 16741573Srgrimes g->strip = p->strip; 16751573Srgrimes } 16761573Srgrimes} 16771573Srgrimes 16781573Srgrimes/* 16791573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string 168092889Sobrien == static void findmust(struct parse *p, struct re_guts *g); 16811573Srgrimes * 16821573Srgrimes * This algorithm could do fancy things like analyzing the operands of | 16831573Srgrimes * for common subsequences. Someday. This code is simple and finds most 16841573Srgrimes * of the interesting cases. 16851573Srgrimes * 16861573Srgrimes * Note that must and mlen got initialized during setup. 16871573Srgrimes */ 16881573Srgrimesstatic void 16891573Srgrimesfindmust(p, g) 16901573Srgrimesstruct parse *p; 169192889Sobrienstruct re_guts *g; 16921573Srgrimes{ 169392889Sobrien sop *scan; 16941573Srgrimes sop *start; 169592889Sobrien sop *newstart; 169692889Sobrien sopno newlen; 169792889Sobrien sop s; 169892889Sobrien char *cp; 169992889Sobrien sopno i; 170062391Sdcs int offset; 170162391Sdcs int cs, mccs; 17021573Srgrimes 17031573Srgrimes /* avoid making error situations worse */ 17041573Srgrimes if (p->error != 0) 17051573Srgrimes return; 17061573Srgrimes 170762391Sdcs /* Find out if we can handle OANYOF or not */ 170862391Sdcs mccs = 0; 170962391Sdcs for (cs = 0; cs < g->ncsets; cs++) 171062391Sdcs if (g->sets[cs].multis != NULL) 171162391Sdcs mccs = 1; 171262391Sdcs 17131573Srgrimes /* find the longest OCHAR sequence in strip */ 17141573Srgrimes newlen = 0; 171562391Sdcs offset = 0; 171662391Sdcs g->moffset = 0; 17171573Srgrimes scan = g->strip + 1; 17181573Srgrimes do { 17191573Srgrimes s = *scan++; 17201573Srgrimes switch (OP(s)) { 17211573Srgrimes case OCHAR: /* sequence member */ 17221573Srgrimes if (newlen == 0) /* new sequence */ 17231573Srgrimes newstart = scan - 1; 17241573Srgrimes newlen++; 17251573Srgrimes break; 17261573Srgrimes case OPLUS_: /* things that don't break one */ 17271573Srgrimes case OLPAREN: 17281573Srgrimes case ORPAREN: 17291573Srgrimes break; 17301573Srgrimes case OQUEST_: /* things that must be skipped */ 17311573Srgrimes case OCH_: 173262391Sdcs offset = altoffset(scan, offset, mccs); 17331573Srgrimes scan--; 17341573Srgrimes do { 17351573Srgrimes scan += OPND(s); 17361573Srgrimes s = *scan; 17371573Srgrimes /* assert() interferes w debug printouts */ 17381573Srgrimes if (OP(s) != O_QUEST && OP(s) != O_CH && 17391573Srgrimes OP(s) != OOR2) { 17401573Srgrimes g->iflags |= BAD; 17411573Srgrimes return; 17421573Srgrimes } 17431573Srgrimes } while (OP(s) != O_QUEST && OP(s) != O_CH); 17441573Srgrimes /* fallthrough */ 174562391Sdcs case OBOW: /* things that break a sequence */ 174662391Sdcs case OEOW: 174762391Sdcs case OBOL: 174862391Sdcs case OEOL: 174962391Sdcs case O_QUEST: 175062391Sdcs case O_CH: 175162391Sdcs case OEND: 17521573Srgrimes if (newlen > g->mlen) { /* ends one */ 17531573Srgrimes start = newstart; 17541573Srgrimes g->mlen = newlen; 175562391Sdcs if (offset > -1) { 175662391Sdcs g->moffset += offset; 175762391Sdcs offset = newlen; 175862391Sdcs } else 175962391Sdcs g->moffset = offset; 176062391Sdcs } else { 176162391Sdcs if (offset > -1) 176262391Sdcs offset += newlen; 17631573Srgrimes } 17641573Srgrimes newlen = 0; 17651573Srgrimes break; 176662391Sdcs case OANY: 176762391Sdcs if (newlen > g->mlen) { /* ends one */ 176862391Sdcs start = newstart; 176962391Sdcs g->mlen = newlen; 177062391Sdcs if (offset > -1) { 177162391Sdcs g->moffset += offset; 177262391Sdcs offset = newlen; 177362391Sdcs } else 177462391Sdcs g->moffset = offset; 177562391Sdcs } else { 177662391Sdcs if (offset > -1) 177762391Sdcs offset += newlen; 177862391Sdcs } 177962391Sdcs if (offset > -1) 178062391Sdcs offset++; 178162391Sdcs newlen = 0; 178262391Sdcs break; 178362391Sdcs case OANYOF: /* may or may not invalidate offset */ 178462391Sdcs /* First, everything as OANY */ 178562391Sdcs if (newlen > g->mlen) { /* ends one */ 178662391Sdcs start = newstart; 178762391Sdcs g->mlen = newlen; 178862391Sdcs if (offset > -1) { 178962391Sdcs g->moffset += offset; 179062391Sdcs offset = newlen; 179162391Sdcs } else 179262391Sdcs g->moffset = offset; 179362391Sdcs } else { 179462391Sdcs if (offset > -1) 179562391Sdcs offset += newlen; 179662391Sdcs } 179762391Sdcs if (offset > -1) 179862391Sdcs offset++; 179962391Sdcs newlen = 0; 180062391Sdcs /* And, now, if we found out we can't deal with 180162391Sdcs * it, make offset = -1. 180262391Sdcs */ 180362391Sdcs if (mccs) 180462391Sdcs offset = -1; 180562391Sdcs break; 180662391Sdcs default: 180762391Sdcs /* Anything here makes it impossible or too hard 180862391Sdcs * to calculate the offset -- so we give up; 180962391Sdcs * save the last known good offset, in case the 181062391Sdcs * must sequence doesn't occur later. 181162391Sdcs */ 181262391Sdcs if (newlen > g->mlen) { /* ends one */ 181362391Sdcs start = newstart; 181462391Sdcs g->mlen = newlen; 181562391Sdcs if (offset > -1) 181662391Sdcs g->moffset += offset; 181762391Sdcs else 181862391Sdcs g->moffset = offset; 181962391Sdcs } 182062391Sdcs offset = -1; 182162391Sdcs newlen = 0; 182262391Sdcs break; 18231573Srgrimes } 18241573Srgrimes } while (OP(s) != OEND); 18251573Srgrimes 182662391Sdcs if (g->mlen == 0) { /* there isn't one */ 182762391Sdcs g->moffset = -1; 18281573Srgrimes return; 182962391Sdcs } 18301573Srgrimes 18311573Srgrimes /* turn it into a character string */ 18321573Srgrimes g->must = malloc((size_t)g->mlen + 1); 18331573Srgrimes if (g->must == NULL) { /* argh; just forget it */ 18341573Srgrimes g->mlen = 0; 183562391Sdcs g->moffset = -1; 18361573Srgrimes return; 18371573Srgrimes } 18381573Srgrimes cp = g->must; 18391573Srgrimes scan = start; 18401573Srgrimes for (i = g->mlen; i > 0; i--) { 18411573Srgrimes while (OP(s = *scan++) != OCHAR) 18421573Srgrimes continue; 18431573Srgrimes assert(cp < g->must + g->mlen); 18441573Srgrimes *cp++ = (char)OPND(s); 18451573Srgrimes } 18461573Srgrimes assert(cp == g->must + g->mlen); 18471573Srgrimes *cp++ = '\0'; /* just on general principles */ 18481573Srgrimes} 18491573Srgrimes 18501573Srgrimes/* 185162391Sdcs - altoffset - choose biggest offset among multiple choices 185262673Sdcs == static int altoffset(sop *scan, int offset, int mccs); 185362391Sdcs * 185462391Sdcs * Compute, recursively if necessary, the largest offset among multiple 185562391Sdcs * re paths. 185662391Sdcs */ 185762391Sdcsstatic int 185862391Sdcsaltoffset(scan, offset, mccs) 185962391Sdcssop *scan; 186062391Sdcsint offset; 186162391Sdcsint mccs; 186262391Sdcs{ 186362391Sdcs int largest; 186462391Sdcs int try; 186562391Sdcs sop s; 186662391Sdcs 186762391Sdcs /* If we gave up already on offsets, return */ 186862391Sdcs if (offset == -1) 186962391Sdcs return -1; 187062391Sdcs 187162391Sdcs largest = 0; 187262391Sdcs try = 0; 187362391Sdcs s = *scan++; 187462391Sdcs while (OP(s) != O_QUEST && OP(s) != O_CH) { 187562391Sdcs switch (OP(s)) { 187662391Sdcs case OOR1: 187762391Sdcs if (try > largest) 187862391Sdcs largest = try; 187962391Sdcs try = 0; 188062391Sdcs break; 188162391Sdcs case OQUEST_: 188262391Sdcs case OCH_: 188362391Sdcs try = altoffset(scan, try, mccs); 188462391Sdcs if (try == -1) 188562391Sdcs return -1; 188662391Sdcs scan--; 188762391Sdcs do { 188862391Sdcs scan += OPND(s); 188962391Sdcs s = *scan; 189062391Sdcs if (OP(s) != O_QUEST && OP(s) != O_CH && 189162391Sdcs OP(s) != OOR2) 189262391Sdcs return -1; 189362391Sdcs } while (OP(s) != O_QUEST && OP(s) != O_CH); 189462855Sdcs /* We must skip to the next position, or we'll 189562855Sdcs * leave altoffset() too early. 189662855Sdcs */ 189762855Sdcs scan++; 189862391Sdcs break; 189962391Sdcs case OANYOF: 190062391Sdcs if (mccs) 190162391Sdcs return -1; 190262391Sdcs case OCHAR: 190362391Sdcs case OANY: 190462391Sdcs try++; 190562391Sdcs case OBOW: 190662391Sdcs case OEOW: 190762391Sdcs case OLPAREN: 190862391Sdcs case ORPAREN: 190962391Sdcs case OOR2: 191062391Sdcs break; 191162391Sdcs default: 191262391Sdcs try = -1; 191362391Sdcs break; 191462391Sdcs } 191562391Sdcs if (try == -1) 191662391Sdcs return -1; 191762391Sdcs s = *scan++; 191862391Sdcs } 191962391Sdcs 192062391Sdcs if (try > largest) 192162391Sdcs largest = try; 192262391Sdcs 192362391Sdcs return largest+offset; 192462391Sdcs} 192562391Sdcs 192662391Sdcs/* 192762232Sdcs - computejumps - compute char jumps for BM scan 192892889Sobrien == static void computejumps(struct parse *p, struct re_guts *g); 192962232Sdcs * 193062232Sdcs * This algorithm assumes g->must exists and is has size greater than 193162232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 193262232Sdcs * Sara Baase. 193362232Sdcs * 193462232Sdcs * A char jump is the number of characters one needs to jump based on 193562232Sdcs * the value of the character from the text that was mismatched. 193662232Sdcs */ 193762232Sdcsstatic void 193862232Sdcscomputejumps(p, g) 193962232Sdcsstruct parse *p; 194062232Sdcsstruct re_guts *g; 194162232Sdcs{ 194262232Sdcs int ch; 194362232Sdcs int mindex; 194462232Sdcs 194562232Sdcs /* Avoid making errors worse */ 194662232Sdcs if (p->error != 0) 194762232Sdcs return; 194862232Sdcs 194962848Sdcs g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 195062232Sdcs if (g->charjump == NULL) /* Not a fatal error */ 195162232Sdcs return; 195262754Sdcs /* Adjust for signed chars, if necessary */ 195362754Sdcs g->charjump = &g->charjump[-(CHAR_MIN)]; 195462232Sdcs 195562232Sdcs /* If the character does not exist in the pattern, the jump 195662232Sdcs * is equal to the number of characters in the pattern. 195762232Sdcs */ 195862754Sdcs for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 195962232Sdcs g->charjump[ch] = g->mlen; 196062232Sdcs 196162232Sdcs /* If the character does exist, compute the jump that would 196262232Sdcs * take us to the last character in the pattern equal to it 196362232Sdcs * (notice that we match right to left, so that last character 196462232Sdcs * is the first one that would be matched). 196562232Sdcs */ 196662232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 196762754Sdcs g->charjump[g->must[mindex]] = g->mlen - mindex - 1; 196862232Sdcs} 196962232Sdcs 197062232Sdcs/* 197162232Sdcs - computematchjumps - compute match jumps for BM scan 197292889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g); 197362232Sdcs * 197462232Sdcs * This algorithm assumes g->must exists and is has size greater than 197562232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 197662232Sdcs * Sara Baase. 197762232Sdcs * 197862232Sdcs * A match jump is the number of characters one needs to advance based 197962232Sdcs * on the already-matched suffix. 198062232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way 198162232Sdcs * the search algorithm works. 198262232Sdcs */ 198362232Sdcsstatic void 198462232Sdcscomputematchjumps(p, g) 198562232Sdcsstruct parse *p; 198662232Sdcsstruct re_guts *g; 198762232Sdcs{ 198862232Sdcs int mindex; /* General "must" iterator */ 198962232Sdcs int suffix; /* Keeps track of matching suffix */ 199062232Sdcs int ssuffix; /* Keeps track of suffixes' suffix */ 199162232Sdcs int* pmatches; /* pmatches[k] points to the next i 199262232Sdcs * such that i+1...mlen is a substring 199362232Sdcs * of k+1...k+mlen-i-1 199462232Sdcs */ 199562232Sdcs 199662232Sdcs /* Avoid making errors worse */ 199762232Sdcs if (p->error != 0) 199862232Sdcs return; 199962232Sdcs 200062848Sdcs pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 200162232Sdcs if (pmatches == NULL) { 200262232Sdcs g->matchjump = NULL; 200362232Sdcs return; 200462232Sdcs } 200562232Sdcs 200662848Sdcs g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 200762232Sdcs if (g->matchjump == NULL) /* Not a fatal error */ 200862232Sdcs return; 200962232Sdcs 201062232Sdcs /* Set maximum possible jump for each character in the pattern */ 201162232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 201262232Sdcs g->matchjump[mindex] = 2*g->mlen - mindex - 1; 201362232Sdcs 201462232Sdcs /* Compute pmatches[] */ 201562232Sdcs for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 201662232Sdcs mindex--, suffix--) { 201762232Sdcs pmatches[mindex] = suffix; 201862232Sdcs 201962232Sdcs /* If a mismatch is found, interrupting the substring, 202062232Sdcs * compute the matchjump for that position. If no 202162232Sdcs * mismatch is found, then a text substring mismatched 202262232Sdcs * against the suffix will also mismatch against the 202362232Sdcs * substring. 202462232Sdcs */ 202562232Sdcs while (suffix < g->mlen 202662232Sdcs && g->must[mindex] != g->must[suffix]) { 202762232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 202862232Sdcs g->mlen - mindex - 1); 202962232Sdcs suffix = pmatches[suffix]; 203062232Sdcs } 203162232Sdcs } 203262232Sdcs 203362232Sdcs /* Compute the matchjump up to the last substring found to jump 203462232Sdcs * to the beginning of the largest must pattern prefix matching 203562232Sdcs * it's own suffix. 203662232Sdcs */ 203762232Sdcs for (mindex = 0; mindex <= suffix; mindex++) 203862232Sdcs g->matchjump[mindex] = MIN(g->matchjump[mindex], 203962232Sdcs g->mlen + suffix - mindex); 204062232Sdcs 204162232Sdcs ssuffix = pmatches[suffix]; 204262391Sdcs while (suffix < g->mlen) { 204362673Sdcs while (suffix <= ssuffix && suffix < g->mlen) { 204462232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 204562232Sdcs g->mlen + ssuffix - suffix); 204662232Sdcs suffix++; 204762232Sdcs } 204886208Sdcs if (suffix < g->mlen) 204986208Sdcs ssuffix = pmatches[ssuffix]; 205062232Sdcs } 205162232Sdcs 205262232Sdcs free(pmatches); 205362232Sdcs} 205462232Sdcs 205562232Sdcs/* 20561573Srgrimes - pluscount - count + nesting 205792889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g); 20581573Srgrimes */ 20591573Srgrimesstatic sopno /* nesting depth */ 20601573Srgrimespluscount(p, g) 20611573Srgrimesstruct parse *p; 206292889Sobrienstruct re_guts *g; 20631573Srgrimes{ 206492889Sobrien sop *scan; 206592889Sobrien sop s; 206692889Sobrien sopno plusnest = 0; 206792889Sobrien sopno maxnest = 0; 20681573Srgrimes 20691573Srgrimes if (p->error != 0) 20701573Srgrimes return(0); /* there may not be an OEND */ 20711573Srgrimes 20721573Srgrimes scan = g->strip + 1; 20731573Srgrimes do { 20741573Srgrimes s = *scan++; 20751573Srgrimes switch (OP(s)) { 20761573Srgrimes case OPLUS_: 20771573Srgrimes plusnest++; 20781573Srgrimes break; 20791573Srgrimes case O_PLUS: 20801573Srgrimes if (plusnest > maxnest) 20811573Srgrimes maxnest = plusnest; 20821573Srgrimes plusnest--; 20831573Srgrimes break; 20841573Srgrimes } 20851573Srgrimes } while (OP(s) != OEND); 20861573Srgrimes if (plusnest != 0) 20871573Srgrimes g->iflags |= BAD; 20881573Srgrimes return(maxnest); 20891573Srgrimes} 2090