regcomp.c revision 227435
11573Srgrimes/*- 21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer. 31573Srgrimes * Copyright (c) 1992, 1993, 1994 41573Srgrimes * The Regents of the University of California. All rights reserved. 51573Srgrimes * 61573Srgrimes * This code is derived from software contributed to Berkeley by 71573Srgrimes * Henry Spencer. 81573Srgrimes * 91573Srgrimes * Redistribution and use in source and binary forms, with or without 101573Srgrimes * modification, are permitted provided that the following conditions 111573Srgrimes * are met: 121573Srgrimes * 1. Redistributions of source code must retain the above copyright 131573Srgrimes * notice, this list of conditions and the following disclaimer. 141573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 151573Srgrimes * notice, this list of conditions and the following disclaimer in the 161573Srgrimes * documentation and/or other materials provided with the distribution. 171573Srgrimes * 4. Neither the name of the University nor the names of its contributors 181573Srgrimes * may be used to endorse or promote products derived from this software 191573Srgrimes * without specific prior written permission. 201573Srgrimes * 211573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 221573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 231573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 241573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 251573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 261573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 271573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 281573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 291573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 301573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 311573Srgrimes * SUCH DAMAGE. 321573Srgrimes * 331573Srgrimes * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 341573Srgrimes */ 351573Srgrimes 361573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 371573Srgrimesstatic char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 381573Srgrimes#endif /* LIBC_SCCS and not lint */ 3992986Sobrien#include <sys/cdefs.h> 4092986Sobrien__FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 227435 2011-11-11 01:35:07Z kevlo $"); 411573Srgrimes 421573Srgrimes#include <sys/types.h> 431573Srgrimes#include <stdio.h> 441573Srgrimes#include <string.h> 451573Srgrimes#include <ctype.h> 461573Srgrimes#include <limits.h> 471573Srgrimes#include <stdlib.h> 481573Srgrimes#include <regex.h> 49136091Sstefanf#include <runetype.h> 50132019Stjr#include <wchar.h> 51132019Stjr#include <wctype.h> 521573Srgrimes 5319277Sache#include "collate.h" 5419277Sache 551573Srgrimes#include "utils.h" 561573Srgrimes#include "regex2.h" 571573Srgrimes 581573Srgrimes#include "cname.h" 591573Srgrimes 601573Srgrimes/* 611573Srgrimes * parse structure, passed up and down to avoid global variables and 621573Srgrimes * other clumsinesses 631573Srgrimes */ 641573Srgrimesstruct parse { 651573Srgrimes char *next; /* next character in RE */ 661573Srgrimes char *end; /* end of string (-> NUL normally) */ 671573Srgrimes int error; /* has an error been seen? */ 681573Srgrimes sop *strip; /* malloced strip */ 691573Srgrimes sopno ssize; /* malloced strip size (allocated) */ 701573Srgrimes sopno slen; /* malloced strip length (used) */ 711573Srgrimes int ncsalloc; /* number of csets allocated */ 721573Srgrimes struct re_guts *g; 731573Srgrimes# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 741573Srgrimes sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 751573Srgrimes sopno pend[NPAREN]; /* -> ) ([0] unused) */ 761573Srgrimes}; 771573Srgrimes 781573Srgrimes/* ========= begin header generated by ./mkh ========= */ 791573Srgrimes#ifdef __cplusplus 801573Srgrimesextern "C" { 811573Srgrimes#endif 821573Srgrimes 831573Srgrimes/* === regcomp.c === */ 84227435Skevlostatic void p_ere(struct parse *p, int stop); 8592905Sobrienstatic void p_ere_exp(struct parse *p); 8692905Sobrienstatic void p_str(struct parse *p); 87227435Skevlostatic void p_bre(struct parse *p, int end1, int end2); 8892905Sobrienstatic int p_simp_re(struct parse *p, int starordinary); 8992905Sobrienstatic int p_count(struct parse *p); 9092905Sobrienstatic void p_bracket(struct parse *p); 9192905Sobrienstatic void p_b_term(struct parse *p, cset *cs); 9292905Sobrienstatic void p_b_cclass(struct parse *p, cset *cs); 9392905Sobrienstatic void p_b_eclass(struct parse *p, cset *cs); 94132019Stjrstatic wint_t p_b_symbol(struct parse *p); 95132019Stjrstatic wint_t p_b_coll_elem(struct parse *p, wint_t endc); 96132019Stjrstatic wint_t othercase(wint_t ch); 97132019Stjrstatic void bothcases(struct parse *p, wint_t ch); 98132019Stjrstatic void ordinary(struct parse *p, wint_t ch); 9992905Sobrienstatic void nonnewline(struct parse *p); 10092905Sobrienstatic void repeat(struct parse *p, sopno start, int from, int to); 10192905Sobrienstatic int seterr(struct parse *p, int e); 10292905Sobrienstatic cset *allocset(struct parse *p); 10392905Sobrienstatic void freeset(struct parse *p, cset *cs); 104132019Stjrstatic void CHadd(struct parse *p, cset *cs, wint_t ch); 105132019Stjrstatic void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 106132019Stjrstatic void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 107132019Stjrstatic wint_t singleton(cset *cs); 10892905Sobrienstatic sopno dupl(struct parse *p, sopno start, sopno finish); 10992905Sobrienstatic void doemit(struct parse *p, sop op, size_t opnd); 11092905Sobrienstatic void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 11192905Sobrienstatic void dofwd(struct parse *p, sopno pos, sop value); 112227414Skevlostatic int enlarge(struct parse *p, sopno size); 11392905Sobrienstatic void stripsnug(struct parse *p, struct re_guts *g); 11492905Sobrienstatic void findmust(struct parse *p, struct re_guts *g); 115131973Stjrstatic int altoffset(sop *scan, int offset); 11692905Sobrienstatic void computejumps(struct parse *p, struct re_guts *g); 11792905Sobrienstatic void computematchjumps(struct parse *p, struct re_guts *g); 11892905Sobrienstatic sopno pluscount(struct parse *p, struct re_guts *g); 119132019Stjrstatic wint_t wgetnext(struct parse *p); 1201573Srgrimes 1211573Srgrimes#ifdef __cplusplus 1221573Srgrimes} 1231573Srgrimes#endif 1241573Srgrimes/* ========= end header generated by ./mkh ========= */ 1251573Srgrimes 1261573Srgrimesstatic char nuls[10]; /* place to point scanner in event of error */ 1271573Srgrimes 1281573Srgrimes/* 1291573Srgrimes * macros for use with parse structure 1301573Srgrimes * BEWARE: these know that the parse structure is named `p' !!! 1311573Srgrimes */ 1321573Srgrimes#define PEEK() (*p->next) 1331573Srgrimes#define PEEK2() (*(p->next+1)) 1341573Srgrimes#define MORE() (p->next < p->end) 1351573Srgrimes#define MORE2() (p->next+1 < p->end) 1361573Srgrimes#define SEE(c) (MORE() && PEEK() == (c)) 1371573Srgrimes#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 1381573Srgrimes#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 1391573Srgrimes#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 1401573Srgrimes#define NEXT() (p->next++) 1411573Srgrimes#define NEXT2() (p->next += 2) 1421573Srgrimes#define NEXTn(n) (p->next += (n)) 1431573Srgrimes#define GETNEXT() (*p->next++) 144132019Stjr#define WGETNEXT() wgetnext(p) 1451573Srgrimes#define SETERROR(e) seterr(p, (e)) 1461573Srgrimes#define REQUIRE(co, e) ((co) || SETERROR(e)) 1471573Srgrimes#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 1481573Srgrimes#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 1491573Srgrimes#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 1501573Srgrimes#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 1511573Srgrimes#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 1521573Srgrimes#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 1531573Srgrimes#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 1541573Srgrimes#define HERE() (p->slen) 1551573Srgrimes#define THERE() (p->slen - 1) 1561573Srgrimes#define THERETHERE() (p->slen - 2) 1571573Srgrimes#define DROP(n) (p->slen -= (n)) 1581573Srgrimes 1591573Srgrimes#ifndef NDEBUG 1601573Srgrimesstatic int never = 0; /* for use in asserts; shuts lint up */ 1611573Srgrimes#else 1621573Srgrimes#define never 0 /* some <assert.h>s have bugs too */ 1631573Srgrimes#endif 1641573Srgrimes 16562232Sdcs/* Macro used by computejump()/computematchjump() */ 16662232Sdcs#define MIN(a,b) ((a)<(b)?(a):(b)) 16762232Sdcs 1681573Srgrimes/* 1691573Srgrimes - regcomp - interface for parser and compilation 1701573Srgrimes = extern int regcomp(regex_t *, const char *, int); 1711573Srgrimes = #define REG_BASIC 0000 1721573Srgrimes = #define REG_EXTENDED 0001 1731573Srgrimes = #define REG_ICASE 0002 1741573Srgrimes = #define REG_NOSUB 0004 1751573Srgrimes = #define REG_NEWLINE 0010 1761573Srgrimes = #define REG_NOSPEC 0020 1771573Srgrimes = #define REG_PEND 0040 1781573Srgrimes = #define REG_DUMP 0200 1791573Srgrimes */ 1801573Srgrimesint /* 0 success, otherwise REG_something */ 181170528Sdelphijregcomp(regex_t * __restrict preg, 182170528Sdelphij const char * __restrict pattern, 183170528Sdelphij int cflags) 1841573Srgrimes{ 1851573Srgrimes struct parse pa; 18692889Sobrien struct re_guts *g; 18792889Sobrien struct parse *p = &pa; 18892889Sobrien int i; 18992889Sobrien size_t len; 1901573Srgrimes#ifdef REDEBUG 1911573Srgrimes# define GOODFLAGS(f) (f) 1921573Srgrimes#else 1931573Srgrimes# define GOODFLAGS(f) ((f)&~REG_DUMP) 1941573Srgrimes#endif 1951573Srgrimes 1961573Srgrimes cflags = GOODFLAGS(cflags); 1971573Srgrimes if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 1981573Srgrimes return(REG_INVARG); 1991573Srgrimes 2001573Srgrimes if (cflags®_PEND) { 2011573Srgrimes if (preg->re_endp < pattern) 2021573Srgrimes return(REG_INVARG); 2031573Srgrimes len = preg->re_endp - pattern; 2041573Srgrimes } else 2051573Srgrimes len = strlen((char *)pattern); 2061573Srgrimes 2071573Srgrimes /* do the mallocs early so failure handling is easy */ 208131973Stjr g = (struct re_guts *)malloc(sizeof(struct re_guts)); 2091573Srgrimes if (g == NULL) 2101573Srgrimes return(REG_ESPACE); 2111573Srgrimes p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 2121573Srgrimes p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 2131573Srgrimes p->slen = 0; 2141573Srgrimes if (p->strip == NULL) { 2151573Srgrimes free((char *)g); 2161573Srgrimes return(REG_ESPACE); 2171573Srgrimes } 2181573Srgrimes 2191573Srgrimes /* set things up */ 2201573Srgrimes p->g = g; 2211573Srgrimes p->next = (char *)pattern; /* convenience; we do not modify it */ 2221573Srgrimes p->end = p->next + len; 2231573Srgrimes p->error = 0; 2241573Srgrimes p->ncsalloc = 0; 2251573Srgrimes for (i = 0; i < NPAREN; i++) { 2261573Srgrimes p->pbegin[i] = 0; 2271573Srgrimes p->pend[i] = 0; 2281573Srgrimes } 2291573Srgrimes g->sets = NULL; 2301573Srgrimes g->ncsets = 0; 2311573Srgrimes g->cflags = cflags; 2321573Srgrimes g->iflags = 0; 2331573Srgrimes g->nbol = 0; 2341573Srgrimes g->neol = 0; 2351573Srgrimes g->must = NULL; 23662391Sdcs g->moffset = -1; 23762263Sdcs g->charjump = NULL; 23862263Sdcs g->matchjump = NULL; 2391573Srgrimes g->mlen = 0; 2401573Srgrimes g->nsub = 0; 2411573Srgrimes g->backrefs = 0; 2421573Srgrimes 2431573Srgrimes /* do it */ 2441573Srgrimes EMIT(OEND, 0); 2451573Srgrimes g->firststate = THERE(); 2461573Srgrimes if (cflags®_EXTENDED) 2471573Srgrimes p_ere(p, OUT); 2481573Srgrimes else if (cflags®_NOSPEC) 2491573Srgrimes p_str(p); 2501573Srgrimes else 2511573Srgrimes p_bre(p, OUT, OUT); 2521573Srgrimes EMIT(OEND, 0); 2531573Srgrimes g->laststate = THERE(); 2541573Srgrimes 2551573Srgrimes /* tidy up loose ends and fill things in */ 2561573Srgrimes stripsnug(p, g); 2571573Srgrimes findmust(p, g); 25862232Sdcs /* only use Boyer-Moore algorithm if the pattern is bigger 25962232Sdcs * than three characters 26062232Sdcs */ 26162232Sdcs if(g->mlen > 3) { 26262232Sdcs computejumps(p, g); 26362232Sdcs computematchjumps(p, g); 26462755Sdcs if(g->matchjump == NULL && g->charjump != NULL) { 26562232Sdcs free(g->charjump); 26662232Sdcs g->charjump = NULL; 26762232Sdcs } 26862232Sdcs } 2691573Srgrimes g->nplus = pluscount(p, g); 2701573Srgrimes g->magic = MAGIC2; 2711573Srgrimes preg->re_nsub = g->nsub; 2721573Srgrimes preg->re_g = g; 2731573Srgrimes preg->re_magic = MAGIC1; 2741573Srgrimes#ifndef REDEBUG 2751573Srgrimes /* not debugging, so can't rely on the assert() in regexec() */ 2761573Srgrimes if (g->iflags&BAD) 2771573Srgrimes SETERROR(REG_ASSERT); 2781573Srgrimes#endif 2791573Srgrimes 2801573Srgrimes /* win or lose, we're done */ 2811573Srgrimes if (p->error != 0) /* lose */ 2821573Srgrimes regfree(preg); 2831573Srgrimes return(p->error); 2841573Srgrimes} 2851573Srgrimes 2861573Srgrimes/* 2871573Srgrimes - p_ere - ERE parser top level, concatenation and alternation 288227435Skevlo == static void p_ere(struct parse *p, int_t stop); 2891573Srgrimes */ 2901573Srgrimesstatic void 291170528Sdelphijp_ere(struct parse *p, 292227435Skevlo int stop) /* character this ERE should end at */ 2931573Srgrimes{ 29492889Sobrien char c; 29592889Sobrien sopno prevback; 29692889Sobrien sopno prevfwd; 29792889Sobrien sopno conc; 29892889Sobrien int first = 1; /* is this the first alternative? */ 2991573Srgrimes 3001573Srgrimes for (;;) { 3011573Srgrimes /* do a bunch of concatenated expressions */ 3021573Srgrimes conc = HERE(); 3031573Srgrimes while (MORE() && (c = PEEK()) != '|' && c != stop) 3041573Srgrimes p_ere_exp(p); 30517141Sjkh (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 3061573Srgrimes 3071573Srgrimes if (!EAT('|')) 3081573Srgrimes break; /* NOTE BREAK OUT */ 3091573Srgrimes 3101573Srgrimes if (first) { 3111573Srgrimes INSERT(OCH_, conc); /* offset is wrong */ 3121573Srgrimes prevfwd = conc; 3131573Srgrimes prevback = conc; 3141573Srgrimes first = 0; 3151573Srgrimes } 3161573Srgrimes ASTERN(OOR1, prevback); 3171573Srgrimes prevback = THERE(); 3181573Srgrimes AHEAD(prevfwd); /* fix previous offset */ 3191573Srgrimes prevfwd = HERE(); 3201573Srgrimes EMIT(OOR2, 0); /* offset is very wrong */ 3211573Srgrimes } 3221573Srgrimes 3231573Srgrimes if (!first) { /* tail-end fixups */ 3241573Srgrimes AHEAD(prevfwd); 3251573Srgrimes ASTERN(O_CH, prevback); 3261573Srgrimes } 3271573Srgrimes 3281573Srgrimes assert(!MORE() || SEE(stop)); 3291573Srgrimes} 3301573Srgrimes 3311573Srgrimes/* 3321573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 33392889Sobrien == static void p_ere_exp(struct parse *p); 3341573Srgrimes */ 3351573Srgrimesstatic void 336170528Sdelphijp_ere_exp(struct parse *p) 3371573Srgrimes{ 33892889Sobrien char c; 339132019Stjr wint_t wc; 34092889Sobrien sopno pos; 34192889Sobrien int count; 34292889Sobrien int count2; 34392889Sobrien sopno subno; 3441573Srgrimes int wascaret = 0; 3451573Srgrimes 3461573Srgrimes assert(MORE()); /* caller should have ensured this */ 3471573Srgrimes c = GETNEXT(); 3481573Srgrimes 3491573Srgrimes pos = HERE(); 3501573Srgrimes switch (c) { 3511573Srgrimes case '(': 35217141Sjkh (void)REQUIRE(MORE(), REG_EPAREN); 3531573Srgrimes p->g->nsub++; 3541573Srgrimes subno = p->g->nsub; 3551573Srgrimes if (subno < NPAREN) 3561573Srgrimes p->pbegin[subno] = HERE(); 3571573Srgrimes EMIT(OLPAREN, subno); 3581573Srgrimes if (!SEE(')')) 3591573Srgrimes p_ere(p, ')'); 3601573Srgrimes if (subno < NPAREN) { 3611573Srgrimes p->pend[subno] = HERE(); 3621573Srgrimes assert(p->pend[subno] != 0); 3631573Srgrimes } 3641573Srgrimes EMIT(ORPAREN, subno); 36517141Sjkh (void)MUSTEAT(')', REG_EPAREN); 3661573Srgrimes break; 3671573Srgrimes#ifndef POSIX_MISTAKE 3681573Srgrimes case ')': /* happens only if no current unmatched ( */ 3691573Srgrimes /* 3701573Srgrimes * You may ask, why the ifndef? Because I didn't notice 3711573Srgrimes * this until slightly too late for 1003.2, and none of the 3721573Srgrimes * other 1003.2 regular-expression reviewers noticed it at 3731573Srgrimes * all. So an unmatched ) is legal POSIX, at least until 3741573Srgrimes * we can get it fixed. 3751573Srgrimes */ 3761573Srgrimes SETERROR(REG_EPAREN); 3771573Srgrimes break; 3781573Srgrimes#endif 3791573Srgrimes case '^': 3801573Srgrimes EMIT(OBOL, 0); 3811573Srgrimes p->g->iflags |= USEBOL; 3821573Srgrimes p->g->nbol++; 3831573Srgrimes wascaret = 1; 3841573Srgrimes break; 3851573Srgrimes case '$': 3861573Srgrimes EMIT(OEOL, 0); 3871573Srgrimes p->g->iflags |= USEEOL; 3881573Srgrimes p->g->neol++; 3891573Srgrimes break; 3901573Srgrimes case '|': 3911573Srgrimes SETERROR(REG_EMPTY); 3921573Srgrimes break; 3931573Srgrimes case '*': 3941573Srgrimes case '+': 3951573Srgrimes case '?': 3961573Srgrimes SETERROR(REG_BADRPT); 3971573Srgrimes break; 3981573Srgrimes case '.': 3991573Srgrimes if (p->g->cflags®_NEWLINE) 4001573Srgrimes nonnewline(p); 4011573Srgrimes else 4021573Srgrimes EMIT(OANY, 0); 4031573Srgrimes break; 4041573Srgrimes case '[': 4051573Srgrimes p_bracket(p); 4061573Srgrimes break; 4071573Srgrimes case '\\': 40817141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 409132019Stjr wc = WGETNEXT(); 410132019Stjr ordinary(p, wc); 4111573Srgrimes break; 4121573Srgrimes case '{': /* okay as ordinary except if digit follows */ 41349094Sache (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 4141573Srgrimes /* FALLTHROUGH */ 4151573Srgrimes default: 416132019Stjr p->next--; 417132019Stjr wc = WGETNEXT(); 418132019Stjr ordinary(p, wc); 4191573Srgrimes break; 4201573Srgrimes } 4211573Srgrimes 4221573Srgrimes if (!MORE()) 4231573Srgrimes return; 4241573Srgrimes c = PEEK(); 4251573Srgrimes /* we call { a repetition if followed by a digit */ 4261573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 42749094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 4281573Srgrimes return; /* no repetition, we're done */ 4291573Srgrimes NEXT(); 4301573Srgrimes 43117141Sjkh (void)REQUIRE(!wascaret, REG_BADRPT); 4321573Srgrimes switch (c) { 4331573Srgrimes case '*': /* implemented as +? */ 4341573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 4351573Srgrimes INSERT(OPLUS_, pos); 4361573Srgrimes ASTERN(O_PLUS, pos); 4371573Srgrimes INSERT(OQUEST_, pos); 4381573Srgrimes ASTERN(O_QUEST, pos); 4391573Srgrimes break; 4401573Srgrimes case '+': 4411573Srgrimes INSERT(OPLUS_, pos); 4421573Srgrimes ASTERN(O_PLUS, pos); 4431573Srgrimes break; 4441573Srgrimes case '?': 4451573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 4461573Srgrimes INSERT(OCH_, pos); /* offset slightly wrong */ 4471573Srgrimes ASTERN(OOR1, pos); /* this one's right */ 4481573Srgrimes AHEAD(pos); /* fix the OCH_ */ 4491573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 4501573Srgrimes AHEAD(THERE()); /* ...so fix it */ 4511573Srgrimes ASTERN(O_CH, THERETHERE()); 4521573Srgrimes break; 4531573Srgrimes case '{': 4541573Srgrimes count = p_count(p); 4551573Srgrimes if (EAT(',')) { 45649094Sache if (isdigit((uch)PEEK())) { 4571573Srgrimes count2 = p_count(p); 45817141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 4591573Srgrimes } else /* single number with comma */ 4601573Srgrimes count2 = INFINITY; 4611573Srgrimes } else /* just a single number */ 4621573Srgrimes count2 = count; 4631573Srgrimes repeat(p, pos, count, count2); 4641573Srgrimes if (!EAT('}')) { /* error heuristics */ 4651573Srgrimes while (MORE() && PEEK() != '}') 4661573Srgrimes NEXT(); 46717141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 4681573Srgrimes SETERROR(REG_BADBR); 4691573Srgrimes } 4701573Srgrimes break; 4711573Srgrimes } 4721573Srgrimes 4731573Srgrimes if (!MORE()) 4741573Srgrimes return; 4751573Srgrimes c = PEEK(); 4761573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 47749094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 4781573Srgrimes return; 4791573Srgrimes SETERROR(REG_BADRPT); 4801573Srgrimes} 4811573Srgrimes 4821573Srgrimes/* 4831573Srgrimes - p_str - string (no metacharacters) "parser" 48492889Sobrien == static void p_str(struct parse *p); 4851573Srgrimes */ 4861573Srgrimesstatic void 487170528Sdelphijp_str(struct parse *p) 4881573Srgrimes{ 48917141Sjkh (void)REQUIRE(MORE(), REG_EMPTY); 4901573Srgrimes while (MORE()) 491132019Stjr ordinary(p, WGETNEXT()); 4921573Srgrimes} 4931573Srgrimes 4941573Srgrimes/* 4951573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation 496227435Skevlo == static void p_bre(struct parse *p, int end1, \ 497227435Skevlo == int end2); 4981573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check. 4991573Srgrimes * 5001573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first 501131973Stjr * taken as an ordinary character and then revised to be an anchor. 5021573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive. 5031573Srgrimes */ 5041573Srgrimesstatic void 505170528Sdelphijp_bre(struct parse *p, 506227435Skevlo int end1, /* first terminating character */ 507227435Skevlo int end2) /* second terminating character */ 5081573Srgrimes{ 50992889Sobrien sopno start = HERE(); 51092889Sobrien int first = 1; /* first subexpression? */ 51192889Sobrien int wasdollar = 0; 5121573Srgrimes 5131573Srgrimes if (EAT('^')) { 5141573Srgrimes EMIT(OBOL, 0); 5151573Srgrimes p->g->iflags |= USEBOL; 5161573Srgrimes p->g->nbol++; 5171573Srgrimes } 5181573Srgrimes while (MORE() && !SEETWO(end1, end2)) { 5191573Srgrimes wasdollar = p_simp_re(p, first); 5201573Srgrimes first = 0; 5211573Srgrimes } 5221573Srgrimes if (wasdollar) { /* oops, that was a trailing anchor */ 5231573Srgrimes DROP(1); 5241573Srgrimes EMIT(OEOL, 0); 5251573Srgrimes p->g->iflags |= USEEOL; 5261573Srgrimes p->g->neol++; 5271573Srgrimes } 5281573Srgrimes 52917141Sjkh (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 5301573Srgrimes} 5311573Srgrimes 5321573Srgrimes/* 5331573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 53492889Sobrien == static int p_simp_re(struct parse *p, int starordinary); 5351573Srgrimes */ 5361573Srgrimesstatic int /* was the simple RE an unbackslashed $? */ 537170528Sdelphijp_simp_re(struct parse *p, 538170528Sdelphij int starordinary) /* is a leading * an ordinary character? */ 5391573Srgrimes{ 54092889Sobrien int c; 54192889Sobrien int count; 54292889Sobrien int count2; 54392889Sobrien sopno pos; 54492889Sobrien int i; 545132019Stjr wint_t wc; 54692889Sobrien sopno subno; 5471573Srgrimes# define BACKSL (1<<CHAR_BIT) 5481573Srgrimes 5491573Srgrimes pos = HERE(); /* repetion op, if any, covers from here */ 5501573Srgrimes 5511573Srgrimes assert(MORE()); /* caller should have ensured this */ 5521573Srgrimes c = GETNEXT(); 5531573Srgrimes if (c == '\\') { 55417141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 55549094Sache c = BACKSL | GETNEXT(); 5561573Srgrimes } 5571573Srgrimes switch (c) { 5581573Srgrimes case '.': 5591573Srgrimes if (p->g->cflags®_NEWLINE) 5601573Srgrimes nonnewline(p); 5611573Srgrimes else 5621573Srgrimes EMIT(OANY, 0); 5631573Srgrimes break; 5641573Srgrimes case '[': 5651573Srgrimes p_bracket(p); 5661573Srgrimes break; 5671573Srgrimes case BACKSL|'{': 5681573Srgrimes SETERROR(REG_BADRPT); 5691573Srgrimes break; 5701573Srgrimes case BACKSL|'(': 5711573Srgrimes p->g->nsub++; 5721573Srgrimes subno = p->g->nsub; 5731573Srgrimes if (subno < NPAREN) 5741573Srgrimes p->pbegin[subno] = HERE(); 5751573Srgrimes EMIT(OLPAREN, subno); 5761573Srgrimes /* the MORE here is an error heuristic */ 5771573Srgrimes if (MORE() && !SEETWO('\\', ')')) 5781573Srgrimes p_bre(p, '\\', ')'); 5791573Srgrimes if (subno < NPAREN) { 5801573Srgrimes p->pend[subno] = HERE(); 5811573Srgrimes assert(p->pend[subno] != 0); 5821573Srgrimes } 5831573Srgrimes EMIT(ORPAREN, subno); 58417141Sjkh (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 5851573Srgrimes break; 5861573Srgrimes case BACKSL|')': /* should not get here -- must be user */ 5871573Srgrimes case BACKSL|'}': 5881573Srgrimes SETERROR(REG_EPAREN); 5891573Srgrimes break; 5901573Srgrimes case BACKSL|'1': 5911573Srgrimes case BACKSL|'2': 5921573Srgrimes case BACKSL|'3': 5931573Srgrimes case BACKSL|'4': 5941573Srgrimes case BACKSL|'5': 5951573Srgrimes case BACKSL|'6': 5961573Srgrimes case BACKSL|'7': 5971573Srgrimes case BACKSL|'8': 5981573Srgrimes case BACKSL|'9': 5991573Srgrimes i = (c&~BACKSL) - '0'; 6001573Srgrimes assert(i < NPAREN); 6011573Srgrimes if (p->pend[i] != 0) { 6021573Srgrimes assert(i <= p->g->nsub); 6031573Srgrimes EMIT(OBACK_, i); 6041573Srgrimes assert(p->pbegin[i] != 0); 6051573Srgrimes assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 6061573Srgrimes assert(OP(p->strip[p->pend[i]]) == ORPAREN); 6071573Srgrimes (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 6081573Srgrimes EMIT(O_BACK, i); 6091573Srgrimes } else 6101573Srgrimes SETERROR(REG_ESUBREG); 6111573Srgrimes p->g->backrefs = 1; 6121573Srgrimes break; 6131573Srgrimes case '*': 61417141Sjkh (void)REQUIRE(starordinary, REG_BADRPT); 6151573Srgrimes /* FALLTHROUGH */ 6161573Srgrimes default: 617132019Stjr p->next--; 618132019Stjr wc = WGETNEXT(); 619132019Stjr ordinary(p, wc); 6201573Srgrimes break; 6211573Srgrimes } 6221573Srgrimes 6231573Srgrimes if (EAT('*')) { /* implemented as +? */ 6241573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 6251573Srgrimes INSERT(OPLUS_, pos); 6261573Srgrimes ASTERN(O_PLUS, pos); 6271573Srgrimes INSERT(OQUEST_, pos); 6281573Srgrimes ASTERN(O_QUEST, pos); 6291573Srgrimes } else if (EATTWO('\\', '{')) { 6301573Srgrimes count = p_count(p); 6311573Srgrimes if (EAT(',')) { 63249094Sache if (MORE() && isdigit((uch)PEEK())) { 6331573Srgrimes count2 = p_count(p); 63417141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 6351573Srgrimes } else /* single number with comma */ 6361573Srgrimes count2 = INFINITY; 6371573Srgrimes } else /* just a single number */ 6381573Srgrimes count2 = count; 6391573Srgrimes repeat(p, pos, count, count2); 6401573Srgrimes if (!EATTWO('\\', '}')) { /* error heuristics */ 6411573Srgrimes while (MORE() && !SEETWO('\\', '}')) 6421573Srgrimes NEXT(); 64317141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 6441573Srgrimes SETERROR(REG_BADBR); 6451573Srgrimes } 64649094Sache } else if (c == '$') /* $ (but not \$) ends it */ 6471573Srgrimes return(1); 6481573Srgrimes 6491573Srgrimes return(0); 6501573Srgrimes} 6511573Srgrimes 6521573Srgrimes/* 6531573Srgrimes - p_count - parse a repetition count 65492889Sobrien == static int p_count(struct parse *p); 6551573Srgrimes */ 6561573Srgrimesstatic int /* the value */ 657170528Sdelphijp_count(struct parse *p) 6581573Srgrimes{ 65992889Sobrien int count = 0; 66092889Sobrien int ndigits = 0; 6611573Srgrimes 66249094Sache while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 6631573Srgrimes count = count*10 + (GETNEXT() - '0'); 6641573Srgrimes ndigits++; 6651573Srgrimes } 6661573Srgrimes 66717141Sjkh (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 6681573Srgrimes return(count); 6691573Srgrimes} 6701573Srgrimes 6711573Srgrimes/* 6721573Srgrimes - p_bracket - parse a bracketed character list 67392889Sobrien == static void p_bracket(struct parse *p); 6741573Srgrimes */ 6751573Srgrimesstatic void 676170528Sdelphijp_bracket(struct parse *p) 6771573Srgrimes{ 678132019Stjr cset *cs; 679132019Stjr wint_t ch; 6801573Srgrimes 6811573Srgrimes /* Dept of Truly Sickening Special-Case Kludges */ 6821573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 6831573Srgrimes EMIT(OBOW, 0); 6841573Srgrimes NEXTn(6); 6851573Srgrimes return; 6861573Srgrimes } 6871573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 6881573Srgrimes EMIT(OEOW, 0); 6891573Srgrimes NEXTn(6); 6901573Srgrimes return; 6911573Srgrimes } 6921573Srgrimes 693132019Stjr if ((cs = allocset(p)) == NULL) 694132019Stjr return; 695132019Stjr 696132019Stjr if (p->g->cflags®_ICASE) 697132019Stjr cs->icase = 1; 6981573Srgrimes if (EAT('^')) 699132019Stjr cs->invert = 1; 7001573Srgrimes if (EAT(']')) 701132019Stjr CHadd(p, cs, ']'); 7021573Srgrimes else if (EAT('-')) 703132019Stjr CHadd(p, cs, '-'); 7041573Srgrimes while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 7051573Srgrimes p_b_term(p, cs); 7061573Srgrimes if (EAT('-')) 707132019Stjr CHadd(p, cs, '-'); 70817141Sjkh (void)MUSTEAT(']', REG_EBRACK); 7091573Srgrimes 7101573Srgrimes if (p->error != 0) /* don't mess things up further */ 7111573Srgrimes return; 7121573Srgrimes 713132019Stjr if (cs->invert && p->g->cflags®_NEWLINE) 714132019Stjr cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 7151573Srgrimes 716132019Stjr if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 717132019Stjr ordinary(p, ch); 7181573Srgrimes freeset(p, cs); 7191573Srgrimes } else 720132019Stjr EMIT(OANYOF, (int)(cs - p->g->sets)); 7211573Srgrimes} 7221573Srgrimes 7231573Srgrimes/* 7241573Srgrimes - p_b_term - parse one term of a bracketed character list 72592889Sobrien == static void p_b_term(struct parse *p, cset *cs); 7261573Srgrimes */ 7271573Srgrimesstatic void 728170528Sdelphijp_b_term(struct parse *p, cset *cs) 7291573Srgrimes{ 73092889Sobrien char c; 731132019Stjr wint_t start, finish; 732132019Stjr wint_t i; 7331573Srgrimes 7341573Srgrimes /* classify what we've got */ 7351573Srgrimes switch ((MORE()) ? PEEK() : '\0') { 7361573Srgrimes case '[': 7371573Srgrimes c = (MORE2()) ? PEEK2() : '\0'; 7381573Srgrimes break; 7391573Srgrimes case '-': 7401573Srgrimes SETERROR(REG_ERANGE); 7411573Srgrimes return; /* NOTE RETURN */ 7421573Srgrimes break; 7431573Srgrimes default: 7441573Srgrimes c = '\0'; 7451573Srgrimes break; 7461573Srgrimes } 7471573Srgrimes 7481573Srgrimes switch (c) { 7491573Srgrimes case ':': /* character class */ 7501573Srgrimes NEXT2(); 75117141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 7521573Srgrimes c = PEEK(); 75317141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 7541573Srgrimes p_b_cclass(p, cs); 75517141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 75617141Sjkh (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 7571573Srgrimes break; 7581573Srgrimes case '=': /* equivalence class */ 7591573Srgrimes NEXT2(); 76017141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 7611573Srgrimes c = PEEK(); 76217141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 7631573Srgrimes p_b_eclass(p, cs); 76417141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 76517141Sjkh (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 7661573Srgrimes break; 7671573Srgrimes default: /* symbol, ordinary character, or range */ 7681573Srgrimes start = p_b_symbol(p); 7691573Srgrimes if (SEE('-') && MORE2() && PEEK2() != ']') { 7701573Srgrimes /* range */ 7711573Srgrimes NEXT(); 7721573Srgrimes if (EAT('-')) 7731573Srgrimes finish = '-'; 7741573Srgrimes else 7751573Srgrimes finish = p_b_symbol(p); 7761573Srgrimes } else 7771573Srgrimes finish = start; 77817514Sache if (start == finish) 779132019Stjr CHadd(p, cs, start); 78017514Sache else { 78124637Sache if (__collate_load_error) { 78224637Sache (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 783132019Stjr CHaddrange(p, cs, start, finish); 78424637Sache } else { 78524637Sache (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 786132019Stjr for (i = 0; i <= UCHAR_MAX; i++) { 78724637Sache if ( __collate_range_cmp(start, i) <= 0 78824637Sache && __collate_range_cmp(i, finish) <= 0 78924637Sache ) 790132019Stjr CHadd(p, cs, i); 79124637Sache } 79217514Sache } 79317514Sache } 7941573Srgrimes break; 7951573Srgrimes } 7961573Srgrimes} 7971573Srgrimes 7981573Srgrimes/* 7991573Srgrimes - p_b_cclass - parse a character-class name and deal with it 80092889Sobrien == static void p_b_cclass(struct parse *p, cset *cs); 8011573Srgrimes */ 8021573Srgrimesstatic void 803170528Sdelphijp_b_cclass(struct parse *p, cset *cs) 8041573Srgrimes{ 80592889Sobrien char *sp = p->next; 80692889Sobrien size_t len; 807132019Stjr wctype_t wct; 808132019Stjr char clname[16]; 8091573Srgrimes 81017508Sache while (MORE() && isalpha((uch)PEEK())) 8111573Srgrimes NEXT(); 8121573Srgrimes len = p->next - sp; 813132019Stjr if (len >= sizeof(clname) - 1) { 8141573Srgrimes SETERROR(REG_ECTYPE); 8151573Srgrimes return; 8161573Srgrimes } 817132019Stjr memcpy(clname, sp, len); 818132019Stjr clname[len] = '\0'; 819132019Stjr if ((wct = wctype(clname)) == 0) { 820132019Stjr SETERROR(REG_ECTYPE); 821132019Stjr return; 82217508Sache } 823132019Stjr CHaddtype(p, cs, wct); 8241573Srgrimes} 8251573Srgrimes 8261573Srgrimes/* 8271573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it 82892889Sobrien == static void p_b_eclass(struct parse *p, cset *cs); 8291573Srgrimes * 8301573Srgrimes * This implementation is incomplete. xxx 8311573Srgrimes */ 8321573Srgrimesstatic void 833170528Sdelphijp_b_eclass(struct parse *p, cset *cs) 8341573Srgrimes{ 835132019Stjr wint_t c; 8361573Srgrimes 8371573Srgrimes c = p_b_coll_elem(p, '='); 838132019Stjr CHadd(p, cs, c); 8391573Srgrimes} 8401573Srgrimes 8411573Srgrimes/* 8421573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 843227414Skevlo == static wint_t p_b_symbol(struct parse *p); 8441573Srgrimes */ 845132019Stjrstatic wint_t /* value of symbol */ 846170528Sdelphijp_b_symbol(struct parse *p) 8471573Srgrimes{ 848132019Stjr wint_t value; 8491573Srgrimes 85017141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 8511573Srgrimes if (!EATTWO('[', '.')) 852132019Stjr return(WGETNEXT()); 8531573Srgrimes 8541573Srgrimes /* collating symbol */ 8551573Srgrimes value = p_b_coll_elem(p, '.'); 85617141Sjkh (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 8571573Srgrimes return(value); 8581573Srgrimes} 8591573Srgrimes 8601573Srgrimes/* 8611573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up 862227414Skevlo == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 8631573Srgrimes */ 864132019Stjrstatic wint_t /* value of collating element */ 865170528Sdelphijp_b_coll_elem(struct parse *p, 866170528Sdelphij wint_t endc) /* name ended by endc,']' */ 8671573Srgrimes{ 86892889Sobrien char *sp = p->next; 86992889Sobrien struct cname *cp; 87092889Sobrien int len; 871132019Stjr mbstate_t mbs; 872132019Stjr wchar_t wc; 873132019Stjr size_t clen; 8741573Srgrimes 8751573Srgrimes while (MORE() && !SEETWO(endc, ']')) 8761573Srgrimes NEXT(); 8771573Srgrimes if (!MORE()) { 8781573Srgrimes SETERROR(REG_EBRACK); 8791573Srgrimes return(0); 8801573Srgrimes } 8811573Srgrimes len = p->next - sp; 8821573Srgrimes for (cp = cnames; cp->name != NULL; cp++) 8831573Srgrimes if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 8841573Srgrimes return(cp->code); /* known name */ 885132019Stjr memset(&mbs, 0, sizeof(mbs)); 886132019Stjr if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 887132019Stjr return (wc); /* single character */ 888132019Stjr else if (clen == (size_t)-1 || clen == (size_t)-2) 889132019Stjr SETERROR(REG_ILLSEQ); 890132019Stjr else 891132019Stjr SETERROR(REG_ECOLLATE); /* neither */ 8921573Srgrimes return(0); 8931573Srgrimes} 8941573Srgrimes 8951573Srgrimes/* 8961573Srgrimes - othercase - return the case counterpart of an alphabetic 897227414Skevlo == static wint_t othercase(wint_t ch); 8981573Srgrimes */ 899132019Stjrstatic wint_t /* if no counterpart, return ch */ 900170528Sdelphijothercase(wint_t ch) 9011573Srgrimes{ 902132019Stjr assert(iswalpha(ch)); 903132019Stjr if (iswupper(ch)) 904132019Stjr return(towlower(ch)); 905132019Stjr else if (iswlower(ch)) 906132019Stjr return(towupper(ch)); 9071573Srgrimes else /* peculiar, but could happen */ 9081573Srgrimes return(ch); 9091573Srgrimes} 9101573Srgrimes 9111573Srgrimes/* 9121573Srgrimes - bothcases - emit a dualcase version of a two-case character 913227414Skevlo == static void bothcases(struct parse *p, wint_t ch); 9141573Srgrimes * 9151573Srgrimes * Boy, is this implementation ever a kludge... 9161573Srgrimes */ 9171573Srgrimesstatic void 918170528Sdelphijbothcases(struct parse *p, wint_t ch) 9191573Srgrimes{ 92092889Sobrien char *oldnext = p->next; 92192889Sobrien char *oldend = p->end; 922132019Stjr char bracket[3 + MB_LEN_MAX]; 923132019Stjr size_t n; 924132019Stjr mbstate_t mbs; 9251573Srgrimes 9261573Srgrimes assert(othercase(ch) != ch); /* p_bracket() would recurse */ 9271573Srgrimes p->next = bracket; 928132019Stjr memset(&mbs, 0, sizeof(mbs)); 929132019Stjr n = wcrtomb(bracket, ch, &mbs); 930132019Stjr assert(n != (size_t)-1); 931132019Stjr bracket[n] = ']'; 932132019Stjr bracket[n + 1] = '\0'; 933132019Stjr p->end = bracket+n+1; 9341573Srgrimes p_bracket(p); 935132019Stjr assert(p->next == p->end); 9361573Srgrimes p->next = oldnext; 9371573Srgrimes p->end = oldend; 9381573Srgrimes} 9391573Srgrimes 9401573Srgrimes/* 9411573Srgrimes - ordinary - emit an ordinary character 942227414Skevlo == static void ordinary(struct parse *p, wint_t ch); 9431573Srgrimes */ 9441573Srgrimesstatic void 945170528Sdelphijordinary(struct parse *p, wint_t ch) 9461573Srgrimes{ 947132019Stjr cset *cs; 9481573Srgrimes 949132019Stjr if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 9501573Srgrimes bothcases(p, ch); 951132019Stjr else if ((ch & OPDMASK) == ch) 952132019Stjr EMIT(OCHAR, ch); 953132019Stjr else { 954132019Stjr /* 955132019Stjr * Kludge: character is too big to fit into an OCHAR operand. 956132019Stjr * Emit a singleton set. 957132019Stjr */ 958132019Stjr if ((cs = allocset(p)) == NULL) 959132019Stjr return; 960132019Stjr CHadd(p, cs, ch); 961132019Stjr EMIT(OANYOF, (int)(cs - p->g->sets)); 962132019Stjr } 9631573Srgrimes} 9641573Srgrimes 9651573Srgrimes/* 9661573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY 96792889Sobrien == static void nonnewline(struct parse *p); 9681573Srgrimes * 9691573Srgrimes * Boy, is this implementation ever a kludge... 9701573Srgrimes */ 9711573Srgrimesstatic void 972170528Sdelphijnonnewline(struct parse *p) 9731573Srgrimes{ 97492889Sobrien char *oldnext = p->next; 97592889Sobrien char *oldend = p->end; 9761573Srgrimes char bracket[4]; 9771573Srgrimes 9781573Srgrimes p->next = bracket; 9791573Srgrimes p->end = bracket+3; 9801573Srgrimes bracket[0] = '^'; 9811573Srgrimes bracket[1] = '\n'; 9821573Srgrimes bracket[2] = ']'; 9831573Srgrimes bracket[3] = '\0'; 9841573Srgrimes p_bracket(p); 9851573Srgrimes assert(p->next == bracket+3); 9861573Srgrimes p->next = oldnext; 9871573Srgrimes p->end = oldend; 9881573Srgrimes} 9891573Srgrimes 9901573Srgrimes/* 9911573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed 99292889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to); 9931573Srgrimes */ 9941573Srgrimesstatic void 995170528Sdelphijrepeat(struct parse *p, 996170528Sdelphij sopno start, /* operand from here to end of strip */ 997170528Sdelphij int from, /* repeated from this number */ 998170528Sdelphij int to) /* to this number of times (maybe INFINITY) */ 9991573Srgrimes{ 100092889Sobrien sopno finish = HERE(); 10011573Srgrimes# define N 2 10021573Srgrimes# define INF 3 10031573Srgrimes# define REP(f, t) ((f)*8 + (t)) 10041573Srgrimes# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 100592889Sobrien sopno copy; 10061573Srgrimes 10071573Srgrimes if (p->error != 0) /* head off possible runaway recursion */ 10081573Srgrimes return; 10091573Srgrimes 10101573Srgrimes assert(from <= to); 10111573Srgrimes 10121573Srgrimes switch (REP(MAP(from), MAP(to))) { 10131573Srgrimes case REP(0, 0): /* must be user doing this */ 10141573Srgrimes DROP(finish-start); /* drop the operand */ 10151573Srgrimes break; 10161573Srgrimes case REP(0, 1): /* as x{1,1}? */ 10171573Srgrimes case REP(0, N): /* as x{1,n}? */ 10181573Srgrimes case REP(0, INF): /* as x{1,}? */ 10191573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10201573Srgrimes INSERT(OCH_, start); /* offset is wrong... */ 10211573Srgrimes repeat(p, start+1, 1, to); 10221573Srgrimes ASTERN(OOR1, start); 10231573Srgrimes AHEAD(start); /* ... fix it */ 10241573Srgrimes EMIT(OOR2, 0); 10251573Srgrimes AHEAD(THERE()); 10261573Srgrimes ASTERN(O_CH, THERETHERE()); 10271573Srgrimes break; 10281573Srgrimes case REP(1, 1): /* trivial case */ 10291573Srgrimes /* done */ 10301573Srgrimes break; 10311573Srgrimes case REP(1, N): /* as x?x{1,n-1} */ 10321573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10331573Srgrimes INSERT(OCH_, start); 10341573Srgrimes ASTERN(OOR1, start); 10351573Srgrimes AHEAD(start); 10361573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 10371573Srgrimes AHEAD(THERE()); /* ...so fix it */ 10381573Srgrimes ASTERN(O_CH, THERETHERE()); 10391573Srgrimes copy = dupl(p, start+1, finish+1); 10401573Srgrimes assert(copy == finish+4); 10411573Srgrimes repeat(p, copy, 1, to-1); 10421573Srgrimes break; 10431573Srgrimes case REP(1, INF): /* as x+ */ 10441573Srgrimes INSERT(OPLUS_, start); 10451573Srgrimes ASTERN(O_PLUS, start); 10461573Srgrimes break; 10471573Srgrimes case REP(N, N): /* as xx{m-1,n-1} */ 10481573Srgrimes copy = dupl(p, start, finish); 10491573Srgrimes repeat(p, copy, from-1, to-1); 10501573Srgrimes break; 10511573Srgrimes case REP(N, INF): /* as xx{n-1,INF} */ 10521573Srgrimes copy = dupl(p, start, finish); 10531573Srgrimes repeat(p, copy, from-1, to); 10541573Srgrimes break; 10551573Srgrimes default: /* "can't happen" */ 10561573Srgrimes SETERROR(REG_ASSERT); /* just in case */ 10571573Srgrimes break; 10581573Srgrimes } 10591573Srgrimes} 10601573Srgrimes 10611573Srgrimes/* 1062132019Stjr - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1063132019Stjr - character from the parse struct, signals a REG_ILLSEQ error if the 1064132019Stjr - character can't be converted. Returns the number of bytes consumed. 1065132019Stjr */ 1066132019Stjrstatic wint_t 1067170528Sdelphijwgetnext(struct parse *p) 1068132019Stjr{ 1069132019Stjr mbstate_t mbs; 1070132019Stjr wchar_t wc; 1071132019Stjr size_t n; 1072132019Stjr 1073132019Stjr memset(&mbs, 0, sizeof(mbs)); 1074132019Stjr n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1075132019Stjr if (n == (size_t)-1 || n == (size_t)-2) { 1076132019Stjr SETERROR(REG_ILLSEQ); 1077132019Stjr return (0); 1078132019Stjr } 1079132019Stjr if (n == 0) 1080132019Stjr n = 1; 1081132019Stjr p->next += n; 1082132019Stjr return (wc); 1083132019Stjr} 1084132019Stjr 1085132019Stjr/* 10861573Srgrimes - seterr - set an error condition 108792889Sobrien == static int seterr(struct parse *p, int e); 10881573Srgrimes */ 10891573Srgrimesstatic int /* useless but makes type checking happy */ 1090170528Sdelphijseterr(struct parse *p, int e) 10911573Srgrimes{ 10921573Srgrimes if (p->error == 0) /* keep earliest error condition */ 10931573Srgrimes p->error = e; 10941573Srgrimes p->next = nuls; /* try to bring things to a halt */ 10951573Srgrimes p->end = nuls; 10961573Srgrimes return(0); /* make the return value well-defined */ 10971573Srgrimes} 10981573Srgrimes 10991573Srgrimes/* 11001573Srgrimes - allocset - allocate a set of characters for [] 110192889Sobrien == static cset *allocset(struct parse *p); 11021573Srgrimes */ 11031573Srgrimesstatic cset * 1104170528Sdelphijallocset(struct parse *p) 11051573Srgrimes{ 1106132019Stjr cset *cs, *ncs; 11071573Srgrimes 1108132019Stjr ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); 1109132019Stjr if (ncs == NULL) { 1110132019Stjr SETERROR(REG_ESPACE); 1111132019Stjr return (NULL); 11121573Srgrimes } 1113132019Stjr p->g->sets = ncs; 1114132019Stjr cs = &p->g->sets[p->g->ncsets++]; 1115132019Stjr memset(cs, 0, sizeof(*cs)); 11161573Srgrimes 11171573Srgrimes return(cs); 11181573Srgrimes} 11191573Srgrimes 11201573Srgrimes/* 11211573Srgrimes - freeset - free a now-unused set 112292889Sobrien == static void freeset(struct parse *p, cset *cs); 11231573Srgrimes */ 11241573Srgrimesstatic void 1125170528Sdelphijfreeset(struct parse *p, cset *cs) 11261573Srgrimes{ 112792889Sobrien cset *top = &p->g->sets[p->g->ncsets]; 11281573Srgrimes 1129132019Stjr free(cs->wides); 1130132019Stjr free(cs->ranges); 1131132019Stjr free(cs->types); 1132132019Stjr memset(cs, 0, sizeof(*cs)); 11331573Srgrimes if (cs == top-1) /* recover only the easy case */ 11341573Srgrimes p->g->ncsets--; 11351573Srgrimes} 11361573Srgrimes 11371573Srgrimes/* 1138132019Stjr - singleton - Determine whether a set contains only one character, 1139132019Stjr - returning it if so, otherwise returning OUT. 11401573Srgrimes */ 1141132019Stjrstatic wint_t 1142170528Sdelphijsingleton(cset *cs) 11431573Srgrimes{ 1144132019Stjr wint_t i, s, n; 11451573Srgrimes 1146132019Stjr for (i = n = 0; i < NC; i++) 1147132019Stjr if (CHIN(cs, i)) { 1148132019Stjr n++; 1149132019Stjr s = i; 11501573Srgrimes } 1151132019Stjr if (n == 1) 1152132019Stjr return (s); 1153132019Stjr if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1154132019Stjr cs->icase == 0) 1155132019Stjr return (cs->wides[0]); 1156132019Stjr /* Don't bother handling the other cases. */ 1157132019Stjr return (OUT); 1158132019Stjr} 11591573Srgrimes 1160132019Stjr/* 1161132019Stjr - CHadd - add character to character set. 1162132019Stjr */ 1163132019Stjrstatic void 1164170528SdelphijCHadd(struct parse *p, cset *cs, wint_t ch) 1165132019Stjr{ 1166134802Stjr wint_t nch, *newwides; 1167132019Stjr assert(ch >= 0); 1168134802Stjr if (ch < NC) 1169132019Stjr cs->bmp[ch >> 3] |= 1 << (ch & 7); 1170134802Stjr else { 1171132019Stjr newwides = realloc(cs->wides, (cs->nwides + 1) * 1172132019Stjr sizeof(*cs->wides)); 1173132019Stjr if (newwides == NULL) { 1174132019Stjr SETERROR(REG_ESPACE); 1175132019Stjr return; 1176132019Stjr } 1177132019Stjr cs->wides = newwides; 1178132019Stjr cs->wides[cs->nwides++] = ch; 11791573Srgrimes } 1180134802Stjr if (cs->icase) { 1181134802Stjr if ((nch = towlower(ch)) < NC) 1182134802Stjr cs->bmp[nch >> 3] |= 1 << (nch & 7); 1183134802Stjr if ((nch = towupper(ch)) < NC) 1184134802Stjr cs->bmp[nch >> 3] |= 1 << (nch & 7); 1185134802Stjr } 11861573Srgrimes} 11871573Srgrimes 11881573Srgrimes/* 1189132019Stjr - CHaddrange - add all characters in the range [min,max] to a character set. 11901573Srgrimes */ 1191132019Stjrstatic void 1192170528SdelphijCHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 11931573Srgrimes{ 1194132019Stjr crange *newranges; 11951573Srgrimes 1196132019Stjr for (; min < NC && min <= max; min++) 1197132019Stjr CHadd(p, cs, min); 1198132019Stjr if (min >= max) 1199132019Stjr return; 1200132019Stjr newranges = realloc(cs->ranges, (cs->nranges + 1) * 1201132019Stjr sizeof(*cs->ranges)); 1202132019Stjr if (newranges == NULL) { 1203132019Stjr SETERROR(REG_ESPACE); 1204132019Stjr return; 1205132019Stjr } 1206132019Stjr cs->ranges = newranges; 1207132019Stjr cs->ranges[cs->nranges].min = min; 1208132019Stjr cs->ranges[cs->nranges].min = max; 1209132019Stjr cs->nranges++; 12101573Srgrimes} 12111573Srgrimes 12121573Srgrimes/* 1213132019Stjr - CHaddtype - add all characters of a certain type to a character set. 12141573Srgrimes */ 1215132019Stjrstatic void 1216170528SdelphijCHaddtype(struct parse *p, cset *cs, wctype_t wct) 12171573Srgrimes{ 1218132019Stjr wint_t i; 1219132019Stjr wctype_t *newtypes; 12201573Srgrimes 1221134802Stjr for (i = 0; i < NC; i++) 1222132019Stjr if (iswctype(i, wct)) 1223132019Stjr CHadd(p, cs, i); 1224132019Stjr newtypes = realloc(cs->types, (cs->ntypes + 1) * 1225132019Stjr sizeof(*cs->types)); 1226132019Stjr if (newtypes == NULL) { 1227132019Stjr SETERROR(REG_ESPACE); 1228132019Stjr return; 1229132019Stjr } 1230132019Stjr cs->types = newtypes; 1231132019Stjr cs->types[cs->ntypes++] = wct; 12321573Srgrimes} 12331573Srgrimes 12341573Srgrimes/* 12351573Srgrimes - dupl - emit a duplicate of a bunch of sops 123692889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish); 12371573Srgrimes */ 12381573Srgrimesstatic sopno /* start of duplicate */ 1239170528Sdelphijdupl(struct parse *p, 1240170528Sdelphij sopno start, /* from here */ 1241170528Sdelphij sopno finish) /* to this less one */ 12421573Srgrimes{ 124392889Sobrien sopno ret = HERE(); 124492889Sobrien sopno len = finish - start; 12451573Srgrimes 12461573Srgrimes assert(finish >= start); 12471573Srgrimes if (len == 0) 12481573Srgrimes return(ret); 1249227414Skevlo if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1250227414Skevlo return(ret); 12511573Srgrimes (void) memcpy((char *)(p->strip + p->slen), 12521573Srgrimes (char *)(p->strip + start), (size_t)len*sizeof(sop)); 12531573Srgrimes p->slen += len; 12541573Srgrimes return(ret); 12551573Srgrimes} 12561573Srgrimes 12571573Srgrimes/* 12581573Srgrimes - doemit - emit a strip operator 125992889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd); 12601573Srgrimes * 12611573Srgrimes * It might seem better to implement this as a macro with a function as 12621573Srgrimes * hard-case backup, but it's just too big and messy unless there are 12631573Srgrimes * some changes to the data structures. Maybe later. 12641573Srgrimes */ 12651573Srgrimesstatic void 1266170528Sdelphijdoemit(struct parse *p, sop op, size_t opnd) 12671573Srgrimes{ 12681573Srgrimes /* avoid making error situations worse */ 12691573Srgrimes if (p->error != 0) 12701573Srgrimes return; 12711573Srgrimes 12721573Srgrimes /* deal with oversize operands ("can't happen", more or less) */ 12731573Srgrimes assert(opnd < 1<<OPSHIFT); 12741573Srgrimes 12751573Srgrimes /* deal with undersized strip */ 12761573Srgrimes if (p->slen >= p->ssize) 1277227414Skevlo if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1278227414Skevlo return; 12791573Srgrimes 12801573Srgrimes /* finally, it's all reduced to the easy case */ 12811573Srgrimes p->strip[p->slen++] = SOP(op, opnd); 12821573Srgrimes} 12831573Srgrimes 12841573Srgrimes/* 12851573Srgrimes - doinsert - insert a sop into the strip 128692889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 12871573Srgrimes */ 12881573Srgrimesstatic void 1289170528Sdelphijdoinsert(struct parse *p, sop op, size_t opnd, sopno pos) 12901573Srgrimes{ 129192889Sobrien sopno sn; 129292889Sobrien sop s; 129392889Sobrien int i; 12941573Srgrimes 12951573Srgrimes /* avoid making error situations worse */ 12961573Srgrimes if (p->error != 0) 12971573Srgrimes return; 12981573Srgrimes 12991573Srgrimes sn = HERE(); 13001573Srgrimes EMIT(op, opnd); /* do checks, ensure space */ 13011573Srgrimes assert(HERE() == sn+1); 13021573Srgrimes s = p->strip[sn]; 13031573Srgrimes 13041573Srgrimes /* adjust paren pointers */ 13051573Srgrimes assert(pos > 0); 13061573Srgrimes for (i = 1; i < NPAREN; i++) { 13071573Srgrimes if (p->pbegin[i] >= pos) { 13081573Srgrimes p->pbegin[i]++; 13091573Srgrimes } 13101573Srgrimes if (p->pend[i] >= pos) { 13111573Srgrimes p->pend[i]++; 13121573Srgrimes } 13131573Srgrimes } 13141573Srgrimes 13151573Srgrimes memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 13161573Srgrimes (HERE()-pos-1)*sizeof(sop)); 13171573Srgrimes p->strip[pos] = s; 13181573Srgrimes} 13191573Srgrimes 13201573Srgrimes/* 13211573Srgrimes - dofwd - complete a forward reference 132292889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value); 13231573Srgrimes */ 13241573Srgrimesstatic void 1325170528Sdelphijdofwd(struct parse *p, sopno pos, sop value) 13261573Srgrimes{ 13271573Srgrimes /* avoid making error situations worse */ 13281573Srgrimes if (p->error != 0) 13291573Srgrimes return; 13301573Srgrimes 13311573Srgrimes assert(value < 1<<OPSHIFT); 13321573Srgrimes p->strip[pos] = OP(p->strip[pos]) | value; 13331573Srgrimes} 13341573Srgrimes 13351573Srgrimes/* 13361573Srgrimes - enlarge - enlarge the strip 1337227414Skevlo == static int enlarge(struct parse *p, sopno size); 13381573Srgrimes */ 1339227414Skevlostatic int 1340170528Sdelphijenlarge(struct parse *p, sopno size) 13411573Srgrimes{ 134292889Sobrien sop *sp; 13431573Srgrimes 13441573Srgrimes if (p->ssize >= size) 1345227414Skevlo return 1; 13461573Srgrimes 13471573Srgrimes sp = (sop *)realloc(p->strip, size*sizeof(sop)); 13481573Srgrimes if (sp == NULL) { 13491573Srgrimes SETERROR(REG_ESPACE); 1350227414Skevlo return 0; 13511573Srgrimes } 13521573Srgrimes p->strip = sp; 13531573Srgrimes p->ssize = size; 1354227414Skevlo return 1; 13551573Srgrimes} 13561573Srgrimes 13571573Srgrimes/* 13581573Srgrimes - stripsnug - compact the strip 135992889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g); 13601573Srgrimes */ 13611573Srgrimesstatic void 1362170528Sdelphijstripsnug(struct parse *p, struct re_guts *g) 13631573Srgrimes{ 13641573Srgrimes g->nstates = p->slen; 13651573Srgrimes g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 13661573Srgrimes if (g->strip == NULL) { 13671573Srgrimes SETERROR(REG_ESPACE); 13681573Srgrimes g->strip = p->strip; 13691573Srgrimes } 13701573Srgrimes} 13711573Srgrimes 13721573Srgrimes/* 13731573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string 137492889Sobrien == static void findmust(struct parse *p, struct re_guts *g); 13751573Srgrimes * 13761573Srgrimes * This algorithm could do fancy things like analyzing the operands of | 13771573Srgrimes * for common subsequences. Someday. This code is simple and finds most 13781573Srgrimes * of the interesting cases. 13791573Srgrimes * 13801573Srgrimes * Note that must and mlen got initialized during setup. 13811573Srgrimes */ 13821573Srgrimesstatic void 1383170528Sdelphijfindmust(struct parse *p, struct re_guts *g) 13841573Srgrimes{ 138592889Sobrien sop *scan; 13861573Srgrimes sop *start; 138792889Sobrien sop *newstart; 138892889Sobrien sopno newlen; 138992889Sobrien sop s; 139092889Sobrien char *cp; 139162391Sdcs int offset; 1392132019Stjr char buf[MB_LEN_MAX]; 1393132019Stjr size_t clen; 1394132019Stjr mbstate_t mbs; 13951573Srgrimes 13961573Srgrimes /* avoid making error situations worse */ 13971573Srgrimes if (p->error != 0) 13981573Srgrimes return; 13991573Srgrimes 1400132019Stjr /* 1401132019Stjr * It's not generally safe to do a ``char'' substring search on 1402132019Stjr * multibyte character strings, but it's safe for at least 1403132019Stjr * UTF-8 (see RFC 3629). 1404132019Stjr */ 1405132019Stjr if (MB_CUR_MAX > 1 && 1406132019Stjr strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1407132019Stjr return; 1408132019Stjr 14091573Srgrimes /* find the longest OCHAR sequence in strip */ 14101573Srgrimes newlen = 0; 141162391Sdcs offset = 0; 141262391Sdcs g->moffset = 0; 14131573Srgrimes scan = g->strip + 1; 14141573Srgrimes do { 14151573Srgrimes s = *scan++; 14161573Srgrimes switch (OP(s)) { 14171573Srgrimes case OCHAR: /* sequence member */ 1418132019Stjr if (newlen == 0) { /* new sequence */ 1419132019Stjr memset(&mbs, 0, sizeof(mbs)); 14201573Srgrimes newstart = scan - 1; 1421132019Stjr } 1422132019Stjr clen = wcrtomb(buf, OPND(s), &mbs); 1423132019Stjr if (clen == (size_t)-1) 1424132019Stjr goto toohard; 1425132019Stjr newlen += clen; 14261573Srgrimes break; 14271573Srgrimes case OPLUS_: /* things that don't break one */ 14281573Srgrimes case OLPAREN: 14291573Srgrimes case ORPAREN: 14301573Srgrimes break; 14311573Srgrimes case OQUEST_: /* things that must be skipped */ 14321573Srgrimes case OCH_: 1433131973Stjr offset = altoffset(scan, offset); 14341573Srgrimes scan--; 14351573Srgrimes do { 14361573Srgrimes scan += OPND(s); 14371573Srgrimes s = *scan; 14381573Srgrimes /* assert() interferes w debug printouts */ 14391573Srgrimes if (OP(s) != O_QUEST && OP(s) != O_CH && 14401573Srgrimes OP(s) != OOR2) { 14411573Srgrimes g->iflags |= BAD; 14421573Srgrimes return; 14431573Srgrimes } 14441573Srgrimes } while (OP(s) != O_QUEST && OP(s) != O_CH); 1445102411Scharnier /* FALLTHROUGH */ 144662391Sdcs case OBOW: /* things that break a sequence */ 144762391Sdcs case OEOW: 144862391Sdcs case OBOL: 144962391Sdcs case OEOL: 145062391Sdcs case O_QUEST: 145162391Sdcs case O_CH: 145262391Sdcs case OEND: 14531573Srgrimes if (newlen > g->mlen) { /* ends one */ 14541573Srgrimes start = newstart; 14551573Srgrimes g->mlen = newlen; 145662391Sdcs if (offset > -1) { 145762391Sdcs g->moffset += offset; 145862391Sdcs offset = newlen; 145962391Sdcs } else 146062391Sdcs g->moffset = offset; 146162391Sdcs } else { 146262391Sdcs if (offset > -1) 146362391Sdcs offset += newlen; 14641573Srgrimes } 14651573Srgrimes newlen = 0; 14661573Srgrimes break; 146762391Sdcs case OANY: 146862391Sdcs if (newlen > g->mlen) { /* ends one */ 146962391Sdcs start = newstart; 147062391Sdcs g->mlen = newlen; 147162391Sdcs if (offset > -1) { 147262391Sdcs g->moffset += offset; 147362391Sdcs offset = newlen; 147462391Sdcs } else 147562391Sdcs g->moffset = offset; 147662391Sdcs } else { 147762391Sdcs if (offset > -1) 147862391Sdcs offset += newlen; 147962391Sdcs } 148062391Sdcs if (offset > -1) 148162391Sdcs offset++; 148262391Sdcs newlen = 0; 148362391Sdcs break; 148462391Sdcs case OANYOF: /* may or may not invalidate offset */ 148562391Sdcs /* First, everything as OANY */ 148662391Sdcs if (newlen > g->mlen) { /* ends one */ 148762391Sdcs start = newstart; 148862391Sdcs g->mlen = newlen; 148962391Sdcs if (offset > -1) { 149062391Sdcs g->moffset += offset; 149162391Sdcs offset = newlen; 149262391Sdcs } else 149362391Sdcs g->moffset = offset; 149462391Sdcs } else { 149562391Sdcs if (offset > -1) 149662391Sdcs offset += newlen; 149762391Sdcs } 149862391Sdcs if (offset > -1) 149962391Sdcs offset++; 150062391Sdcs newlen = 0; 150162391Sdcs break; 1502132019Stjr toohard: 150362391Sdcs default: 150462391Sdcs /* Anything here makes it impossible or too hard 150562391Sdcs * to calculate the offset -- so we give up; 150662391Sdcs * save the last known good offset, in case the 150762391Sdcs * must sequence doesn't occur later. 150862391Sdcs */ 150962391Sdcs if (newlen > g->mlen) { /* ends one */ 151062391Sdcs start = newstart; 151162391Sdcs g->mlen = newlen; 151262391Sdcs if (offset > -1) 151362391Sdcs g->moffset += offset; 151462391Sdcs else 151562391Sdcs g->moffset = offset; 151662391Sdcs } 151762391Sdcs offset = -1; 151862391Sdcs newlen = 0; 151962391Sdcs break; 15201573Srgrimes } 15211573Srgrimes } while (OP(s) != OEND); 15221573Srgrimes 152362391Sdcs if (g->mlen == 0) { /* there isn't one */ 152462391Sdcs g->moffset = -1; 15251573Srgrimes return; 152662391Sdcs } 15271573Srgrimes 15281573Srgrimes /* turn it into a character string */ 15291573Srgrimes g->must = malloc((size_t)g->mlen + 1); 15301573Srgrimes if (g->must == NULL) { /* argh; just forget it */ 15311573Srgrimes g->mlen = 0; 153262391Sdcs g->moffset = -1; 15331573Srgrimes return; 15341573Srgrimes } 15351573Srgrimes cp = g->must; 15361573Srgrimes scan = start; 1537132019Stjr memset(&mbs, 0, sizeof(mbs)); 1538132019Stjr while (cp < g->must + g->mlen) { 15391573Srgrimes while (OP(s = *scan++) != OCHAR) 15401573Srgrimes continue; 1541132019Stjr clen = wcrtomb(cp, OPND(s), &mbs); 1542132019Stjr assert(clen != (size_t)-1); 1543132019Stjr cp += clen; 15441573Srgrimes } 15451573Srgrimes assert(cp == g->must + g->mlen); 15461573Srgrimes *cp++ = '\0'; /* just on general principles */ 15471573Srgrimes} 15481573Srgrimes 15491573Srgrimes/* 155062391Sdcs - altoffset - choose biggest offset among multiple choices 1551131973Stjr == static int altoffset(sop *scan, int offset); 155262391Sdcs * 155362391Sdcs * Compute, recursively if necessary, the largest offset among multiple 155462391Sdcs * re paths. 155562391Sdcs */ 155662391Sdcsstatic int 1557170528Sdelphijaltoffset(sop *scan, int offset) 155862391Sdcs{ 155962391Sdcs int largest; 156062391Sdcs int try; 156162391Sdcs sop s; 156262391Sdcs 156362391Sdcs /* If we gave up already on offsets, return */ 156462391Sdcs if (offset == -1) 156562391Sdcs return -1; 156662391Sdcs 156762391Sdcs largest = 0; 156862391Sdcs try = 0; 156962391Sdcs s = *scan++; 157062391Sdcs while (OP(s) != O_QUEST && OP(s) != O_CH) { 157162391Sdcs switch (OP(s)) { 157262391Sdcs case OOR1: 157362391Sdcs if (try > largest) 157462391Sdcs largest = try; 157562391Sdcs try = 0; 157662391Sdcs break; 157762391Sdcs case OQUEST_: 157862391Sdcs case OCH_: 1579131973Stjr try = altoffset(scan, try); 158062391Sdcs if (try == -1) 158162391Sdcs return -1; 158262391Sdcs scan--; 158362391Sdcs do { 158462391Sdcs scan += OPND(s); 158562391Sdcs s = *scan; 158662391Sdcs if (OP(s) != O_QUEST && OP(s) != O_CH && 158762391Sdcs OP(s) != OOR2) 158862391Sdcs return -1; 158962391Sdcs } while (OP(s) != O_QUEST && OP(s) != O_CH); 159062855Sdcs /* We must skip to the next position, or we'll 159162855Sdcs * leave altoffset() too early. 159262855Sdcs */ 159362855Sdcs scan++; 159462391Sdcs break; 159562391Sdcs case OANYOF: 159662391Sdcs case OCHAR: 159762391Sdcs case OANY: 159862391Sdcs try++; 159962391Sdcs case OBOW: 160062391Sdcs case OEOW: 160162391Sdcs case OLPAREN: 160262391Sdcs case ORPAREN: 160362391Sdcs case OOR2: 160462391Sdcs break; 160562391Sdcs default: 160662391Sdcs try = -1; 160762391Sdcs break; 160862391Sdcs } 160962391Sdcs if (try == -1) 161062391Sdcs return -1; 161162391Sdcs s = *scan++; 161262391Sdcs } 161362391Sdcs 161462391Sdcs if (try > largest) 161562391Sdcs largest = try; 161662391Sdcs 161762391Sdcs return largest+offset; 161862391Sdcs} 161962391Sdcs 162062391Sdcs/* 162162232Sdcs - computejumps - compute char jumps for BM scan 162292889Sobrien == static void computejumps(struct parse *p, struct re_guts *g); 162362232Sdcs * 162462232Sdcs * This algorithm assumes g->must exists and is has size greater than 162562232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 162662232Sdcs * Sara Baase. 162762232Sdcs * 162862232Sdcs * A char jump is the number of characters one needs to jump based on 162962232Sdcs * the value of the character from the text that was mismatched. 163062232Sdcs */ 163162232Sdcsstatic void 1632170528Sdelphijcomputejumps(struct parse *p, struct re_guts *g) 163362232Sdcs{ 163462232Sdcs int ch; 163562232Sdcs int mindex; 163662232Sdcs 163762232Sdcs /* Avoid making errors worse */ 163862232Sdcs if (p->error != 0) 163962232Sdcs return; 164062232Sdcs 164162848Sdcs g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 164262232Sdcs if (g->charjump == NULL) /* Not a fatal error */ 164362232Sdcs return; 164462754Sdcs /* Adjust for signed chars, if necessary */ 164562754Sdcs g->charjump = &g->charjump[-(CHAR_MIN)]; 164662232Sdcs 164762232Sdcs /* If the character does not exist in the pattern, the jump 164862232Sdcs * is equal to the number of characters in the pattern. 164962232Sdcs */ 165062754Sdcs for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 165162232Sdcs g->charjump[ch] = g->mlen; 165262232Sdcs 165362232Sdcs /* If the character does exist, compute the jump that would 165462232Sdcs * take us to the last character in the pattern equal to it 165562232Sdcs * (notice that we match right to left, so that last character 165662232Sdcs * is the first one that would be matched). 165762232Sdcs */ 165862232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 1659111010Snectar g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 166062232Sdcs} 166162232Sdcs 166262232Sdcs/* 166362232Sdcs - computematchjumps - compute match jumps for BM scan 166492889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g); 166562232Sdcs * 166662232Sdcs * This algorithm assumes g->must exists and is has size greater than 166762232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 166862232Sdcs * Sara Baase. 166962232Sdcs * 167062232Sdcs * A match jump is the number of characters one needs to advance based 167162232Sdcs * on the already-matched suffix. 167262232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way 167362232Sdcs * the search algorithm works. 167462232Sdcs */ 167562232Sdcsstatic void 1676170528Sdelphijcomputematchjumps(struct parse *p, struct re_guts *g) 167762232Sdcs{ 167862232Sdcs int mindex; /* General "must" iterator */ 167962232Sdcs int suffix; /* Keeps track of matching suffix */ 168062232Sdcs int ssuffix; /* Keeps track of suffixes' suffix */ 168162232Sdcs int* pmatches; /* pmatches[k] points to the next i 168262232Sdcs * such that i+1...mlen is a substring 168362232Sdcs * of k+1...k+mlen-i-1 168462232Sdcs */ 168562232Sdcs 168662232Sdcs /* Avoid making errors worse */ 168762232Sdcs if (p->error != 0) 168862232Sdcs return; 168962232Sdcs 169062848Sdcs pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 169162232Sdcs if (pmatches == NULL) { 169262232Sdcs g->matchjump = NULL; 169362232Sdcs return; 169462232Sdcs } 169562232Sdcs 169662848Sdcs g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 169762232Sdcs if (g->matchjump == NULL) /* Not a fatal error */ 169862232Sdcs return; 169962232Sdcs 170062232Sdcs /* Set maximum possible jump for each character in the pattern */ 170162232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 170262232Sdcs g->matchjump[mindex] = 2*g->mlen - mindex - 1; 170362232Sdcs 170462232Sdcs /* Compute pmatches[] */ 170562232Sdcs for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 170662232Sdcs mindex--, suffix--) { 170762232Sdcs pmatches[mindex] = suffix; 170862232Sdcs 170962232Sdcs /* If a mismatch is found, interrupting the substring, 171062232Sdcs * compute the matchjump for that position. If no 171162232Sdcs * mismatch is found, then a text substring mismatched 171262232Sdcs * against the suffix will also mismatch against the 171362232Sdcs * substring. 171462232Sdcs */ 171562232Sdcs while (suffix < g->mlen 171662232Sdcs && g->must[mindex] != g->must[suffix]) { 171762232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 171862232Sdcs g->mlen - mindex - 1); 171962232Sdcs suffix = pmatches[suffix]; 172062232Sdcs } 172162232Sdcs } 172262232Sdcs 172362232Sdcs /* Compute the matchjump up to the last substring found to jump 172462232Sdcs * to the beginning of the largest must pattern prefix matching 172562232Sdcs * it's own suffix. 172662232Sdcs */ 172762232Sdcs for (mindex = 0; mindex <= suffix; mindex++) 172862232Sdcs g->matchjump[mindex] = MIN(g->matchjump[mindex], 172962232Sdcs g->mlen + suffix - mindex); 173062232Sdcs 173162232Sdcs ssuffix = pmatches[suffix]; 173262391Sdcs while (suffix < g->mlen) { 173362673Sdcs while (suffix <= ssuffix && suffix < g->mlen) { 173462232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 173562232Sdcs g->mlen + ssuffix - suffix); 173662232Sdcs suffix++; 173762232Sdcs } 173886208Sdcs if (suffix < g->mlen) 173986208Sdcs ssuffix = pmatches[ssuffix]; 174062232Sdcs } 174162232Sdcs 174262232Sdcs free(pmatches); 174362232Sdcs} 174462232Sdcs 174562232Sdcs/* 17461573Srgrimes - pluscount - count + nesting 174792889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g); 17481573Srgrimes */ 17491573Srgrimesstatic sopno /* nesting depth */ 1750170528Sdelphijpluscount(struct parse *p, struct re_guts *g) 17511573Srgrimes{ 175292889Sobrien sop *scan; 175392889Sobrien sop s; 175492889Sobrien sopno plusnest = 0; 175592889Sobrien sopno maxnest = 0; 17561573Srgrimes 17571573Srgrimes if (p->error != 0) 17581573Srgrimes return(0); /* there may not be an OEND */ 17591573Srgrimes 17601573Srgrimes scan = g->strip + 1; 17611573Srgrimes do { 17621573Srgrimes s = *scan++; 17631573Srgrimes switch (OP(s)) { 17641573Srgrimes case OPLUS_: 17651573Srgrimes plusnest++; 17661573Srgrimes break; 17671573Srgrimes case O_PLUS: 17681573Srgrimes if (plusnest > maxnest) 17691573Srgrimes maxnest = plusnest; 17701573Srgrimes plusnest--; 17711573Srgrimes break; 17721573Srgrimes } 17731573Srgrimes } while (OP(s) != OEND); 17741573Srgrimes if (plusnest != 0) 17751573Srgrimes g->iflags |= BAD; 17761573Srgrimes return(maxnest); 17771573Srgrimes} 1778