11573Srgrimes/*- 21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer. 31573Srgrimes * Copyright (c) 1992, 1993, 1994 41573Srgrimes * The Regents of the University of California. All rights reserved. 51573Srgrimes * 6227753Stheraven * Copyright (c) 2011 The FreeBSD Foundation 7227753Stheraven * All rights reserved. 8227753Stheraven * Portions of this software were developed by David Chisnall 9227753Stheraven * under sponsorship from the FreeBSD Foundation. 10227753Stheraven * 111573Srgrimes * This code is derived from software contributed to Berkeley by 121573Srgrimes * Henry Spencer. 131573Srgrimes * 141573Srgrimes * Redistribution and use in source and binary forms, with or without 151573Srgrimes * modification, are permitted provided that the following conditions 161573Srgrimes * are met: 171573Srgrimes * 1. Redistributions of source code must retain the above copyright 181573Srgrimes * notice, this list of conditions and the following disclaimer. 191573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 201573Srgrimes * notice, this list of conditions and the following disclaimer in the 211573Srgrimes * documentation and/or other materials provided with the distribution. 221573Srgrimes * 4. Neither the name of the University nor the names of its contributors 231573Srgrimes * may be used to endorse or promote products derived from this software 241573Srgrimes * without specific prior written permission. 251573Srgrimes * 261573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 271573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 281573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 291573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 301573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 311573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 321573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 331573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 341573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 351573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 361573Srgrimes * SUCH DAMAGE. 371573Srgrimes * 381573Srgrimes * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 391573Srgrimes */ 401573Srgrimes 411573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 421573Srgrimesstatic char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 431573Srgrimes#endif /* LIBC_SCCS and not lint */ 4492986Sobrien#include <sys/cdefs.h> 4592986Sobrien__FBSDID("$FreeBSD$"); 461573Srgrimes 471573Srgrimes#include <sys/types.h> 481573Srgrimes#include <stdio.h> 491573Srgrimes#include <string.h> 501573Srgrimes#include <ctype.h> 511573Srgrimes#include <limits.h> 521573Srgrimes#include <stdlib.h> 531573Srgrimes#include <regex.h> 54136091Sstefanf#include <runetype.h> 55132019Stjr#include <wchar.h> 56132019Stjr#include <wctype.h> 571573Srgrimes 5819277Sache#include "collate.h" 5919277Sache 601573Srgrimes#include "utils.h" 611573Srgrimes#include "regex2.h" 621573Srgrimes 631573Srgrimes#include "cname.h" 641573Srgrimes 651573Srgrimes/* 661573Srgrimes * parse structure, passed up and down to avoid global variables and 671573Srgrimes * other clumsinesses 681573Srgrimes */ 691573Srgrimesstruct parse { 701573Srgrimes char *next; /* next character in RE */ 711573Srgrimes char *end; /* end of string (-> NUL normally) */ 721573Srgrimes int error; /* has an error been seen? */ 731573Srgrimes sop *strip; /* malloced strip */ 741573Srgrimes sopno ssize; /* malloced strip size (allocated) */ 751573Srgrimes sopno slen; /* malloced strip length (used) */ 761573Srgrimes int ncsalloc; /* number of csets allocated */ 771573Srgrimes struct re_guts *g; 781573Srgrimes# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 791573Srgrimes sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 801573Srgrimes sopno pend[NPAREN]; /* -> ) ([0] unused) */ 811573Srgrimes}; 821573Srgrimes 831573Srgrimes/* ========= begin header generated by ./mkh ========= */ 841573Srgrimes#ifdef __cplusplus 851573Srgrimesextern "C" { 861573Srgrimes#endif 871573Srgrimes 881573Srgrimes/* === regcomp.c === */ 89227435Skevlostatic void p_ere(struct parse *p, int stop); 9092905Sobrienstatic void p_ere_exp(struct parse *p); 9192905Sobrienstatic void p_str(struct parse *p); 92227435Skevlostatic void p_bre(struct parse *p, int end1, int end2); 9392905Sobrienstatic int p_simp_re(struct parse *p, int starordinary); 9492905Sobrienstatic int p_count(struct parse *p); 9592905Sobrienstatic void p_bracket(struct parse *p); 9692905Sobrienstatic void p_b_term(struct parse *p, cset *cs); 9792905Sobrienstatic void p_b_cclass(struct parse *p, cset *cs); 9892905Sobrienstatic void p_b_eclass(struct parse *p, cset *cs); 99132019Stjrstatic wint_t p_b_symbol(struct parse *p); 100132019Stjrstatic wint_t p_b_coll_elem(struct parse *p, wint_t endc); 101132019Stjrstatic wint_t othercase(wint_t ch); 102132019Stjrstatic void bothcases(struct parse *p, wint_t ch); 103132019Stjrstatic void ordinary(struct parse *p, wint_t ch); 10492905Sobrienstatic void nonnewline(struct parse *p); 10592905Sobrienstatic void repeat(struct parse *p, sopno start, int from, int to); 10692905Sobrienstatic int seterr(struct parse *p, int e); 10792905Sobrienstatic cset *allocset(struct parse *p); 10892905Sobrienstatic void freeset(struct parse *p, cset *cs); 109132019Stjrstatic void CHadd(struct parse *p, cset *cs, wint_t ch); 110132019Stjrstatic void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 111132019Stjrstatic void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 112132019Stjrstatic wint_t singleton(cset *cs); 11392905Sobrienstatic sopno dupl(struct parse *p, sopno start, sopno finish); 11492905Sobrienstatic void doemit(struct parse *p, sop op, size_t opnd); 11592905Sobrienstatic void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 11692905Sobrienstatic void dofwd(struct parse *p, sopno pos, sop value); 117227414Skevlostatic int enlarge(struct parse *p, sopno size); 11892905Sobrienstatic void stripsnug(struct parse *p, struct re_guts *g); 11992905Sobrienstatic void findmust(struct parse *p, struct re_guts *g); 120131973Stjrstatic int altoffset(sop *scan, int offset); 12192905Sobrienstatic void computejumps(struct parse *p, struct re_guts *g); 12292905Sobrienstatic void computematchjumps(struct parse *p, struct re_guts *g); 12392905Sobrienstatic sopno pluscount(struct parse *p, struct re_guts *g); 124132019Stjrstatic wint_t wgetnext(struct parse *p); 1251573Srgrimes 1261573Srgrimes#ifdef __cplusplus 1271573Srgrimes} 1281573Srgrimes#endif 1291573Srgrimes/* ========= end header generated by ./mkh ========= */ 1301573Srgrimes 1311573Srgrimesstatic char nuls[10]; /* place to point scanner in event of error */ 1321573Srgrimes 1331573Srgrimes/* 1341573Srgrimes * macros for use with parse structure 1351573Srgrimes * BEWARE: these know that the parse structure is named `p' !!! 1361573Srgrimes */ 1371573Srgrimes#define PEEK() (*p->next) 1381573Srgrimes#define PEEK2() (*(p->next+1)) 1391573Srgrimes#define MORE() (p->next < p->end) 1401573Srgrimes#define MORE2() (p->next+1 < p->end) 1411573Srgrimes#define SEE(c) (MORE() && PEEK() == (c)) 1421573Srgrimes#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 1431573Srgrimes#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 1441573Srgrimes#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 1451573Srgrimes#define NEXT() (p->next++) 1461573Srgrimes#define NEXT2() (p->next += 2) 1471573Srgrimes#define NEXTn(n) (p->next += (n)) 1481573Srgrimes#define GETNEXT() (*p->next++) 149132019Stjr#define WGETNEXT() wgetnext(p) 1501573Srgrimes#define SETERROR(e) seterr(p, (e)) 1511573Srgrimes#define REQUIRE(co, e) ((co) || SETERROR(e)) 1521573Srgrimes#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 1531573Srgrimes#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 1541573Srgrimes#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 1551573Srgrimes#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 1561573Srgrimes#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 1571573Srgrimes#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 1581573Srgrimes#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 1591573Srgrimes#define HERE() (p->slen) 1601573Srgrimes#define THERE() (p->slen - 1) 1611573Srgrimes#define THERETHERE() (p->slen - 2) 1621573Srgrimes#define DROP(n) (p->slen -= (n)) 1631573Srgrimes 1641573Srgrimes#ifndef NDEBUG 1651573Srgrimesstatic int never = 0; /* for use in asserts; shuts lint up */ 1661573Srgrimes#else 1671573Srgrimes#define never 0 /* some <assert.h>s have bugs too */ 1681573Srgrimes#endif 1691573Srgrimes 17062232Sdcs/* Macro used by computejump()/computematchjump() */ 17162232Sdcs#define MIN(a,b) ((a)<(b)?(a):(b)) 17262232Sdcs 1731573Srgrimes/* 1741573Srgrimes - regcomp - interface for parser and compilation 1751573Srgrimes = extern int regcomp(regex_t *, const char *, int); 1761573Srgrimes = #define REG_BASIC 0000 1771573Srgrimes = #define REG_EXTENDED 0001 1781573Srgrimes = #define REG_ICASE 0002 1791573Srgrimes = #define REG_NOSUB 0004 1801573Srgrimes = #define REG_NEWLINE 0010 1811573Srgrimes = #define REG_NOSPEC 0020 1821573Srgrimes = #define REG_PEND 0040 1831573Srgrimes = #define REG_DUMP 0200 1841573Srgrimes */ 1851573Srgrimesint /* 0 success, otherwise REG_something */ 186170528Sdelphijregcomp(regex_t * __restrict preg, 187170528Sdelphij const char * __restrict pattern, 188170528Sdelphij int cflags) 1891573Srgrimes{ 1901573Srgrimes struct parse pa; 19192889Sobrien struct re_guts *g; 19292889Sobrien struct parse *p = &pa; 19392889Sobrien int i; 19492889Sobrien size_t len; 1951573Srgrimes#ifdef REDEBUG 1961573Srgrimes# define GOODFLAGS(f) (f) 1971573Srgrimes#else 1981573Srgrimes# define GOODFLAGS(f) ((f)&~REG_DUMP) 1991573Srgrimes#endif 2001573Srgrimes 2011573Srgrimes cflags = GOODFLAGS(cflags); 2021573Srgrimes if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 2031573Srgrimes return(REG_INVARG); 2041573Srgrimes 2051573Srgrimes if (cflags®_PEND) { 2061573Srgrimes if (preg->re_endp < pattern) 2071573Srgrimes return(REG_INVARG); 2081573Srgrimes len = preg->re_endp - pattern; 2091573Srgrimes } else 2101573Srgrimes len = strlen((char *)pattern); 2111573Srgrimes 2121573Srgrimes /* do the mallocs early so failure handling is easy */ 213131973Stjr g = (struct re_guts *)malloc(sizeof(struct re_guts)); 2141573Srgrimes if (g == NULL) 2151573Srgrimes return(REG_ESPACE); 2161573Srgrimes p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 2171573Srgrimes p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 2181573Srgrimes p->slen = 0; 2191573Srgrimes if (p->strip == NULL) { 2201573Srgrimes free((char *)g); 2211573Srgrimes return(REG_ESPACE); 2221573Srgrimes } 2231573Srgrimes 2241573Srgrimes /* set things up */ 2251573Srgrimes p->g = g; 2261573Srgrimes p->next = (char *)pattern; /* convenience; we do not modify it */ 2271573Srgrimes p->end = p->next + len; 2281573Srgrimes p->error = 0; 2291573Srgrimes p->ncsalloc = 0; 2301573Srgrimes for (i = 0; i < NPAREN; i++) { 2311573Srgrimes p->pbegin[i] = 0; 2321573Srgrimes p->pend[i] = 0; 2331573Srgrimes } 2341573Srgrimes g->sets = NULL; 2351573Srgrimes g->ncsets = 0; 2361573Srgrimes g->cflags = cflags; 2371573Srgrimes g->iflags = 0; 2381573Srgrimes g->nbol = 0; 2391573Srgrimes g->neol = 0; 2401573Srgrimes g->must = NULL; 24162391Sdcs g->moffset = -1; 24262263Sdcs g->charjump = NULL; 24362263Sdcs g->matchjump = NULL; 2441573Srgrimes g->mlen = 0; 2451573Srgrimes g->nsub = 0; 2461573Srgrimes g->backrefs = 0; 2471573Srgrimes 2481573Srgrimes /* do it */ 2491573Srgrimes EMIT(OEND, 0); 2501573Srgrimes g->firststate = THERE(); 2511573Srgrimes if (cflags®_EXTENDED) 2521573Srgrimes p_ere(p, OUT); 2531573Srgrimes else if (cflags®_NOSPEC) 2541573Srgrimes p_str(p); 2551573Srgrimes else 2561573Srgrimes p_bre(p, OUT, OUT); 2571573Srgrimes EMIT(OEND, 0); 2581573Srgrimes g->laststate = THERE(); 2591573Srgrimes 2601573Srgrimes /* tidy up loose ends and fill things in */ 2611573Srgrimes stripsnug(p, g); 2621573Srgrimes findmust(p, g); 26362232Sdcs /* only use Boyer-Moore algorithm if the pattern is bigger 26462232Sdcs * than three characters 26562232Sdcs */ 26662232Sdcs if(g->mlen > 3) { 26762232Sdcs computejumps(p, g); 26862232Sdcs computematchjumps(p, g); 26962755Sdcs if(g->matchjump == NULL && g->charjump != NULL) { 27062232Sdcs free(g->charjump); 27162232Sdcs g->charjump = NULL; 27262232Sdcs } 27362232Sdcs } 2741573Srgrimes g->nplus = pluscount(p, g); 2751573Srgrimes g->magic = MAGIC2; 2761573Srgrimes preg->re_nsub = g->nsub; 2771573Srgrimes preg->re_g = g; 2781573Srgrimes preg->re_magic = MAGIC1; 2791573Srgrimes#ifndef REDEBUG 2801573Srgrimes /* not debugging, so can't rely on the assert() in regexec() */ 2811573Srgrimes if (g->iflags&BAD) 2821573Srgrimes SETERROR(REG_ASSERT); 2831573Srgrimes#endif 2841573Srgrimes 2851573Srgrimes /* win or lose, we're done */ 2861573Srgrimes if (p->error != 0) /* lose */ 2871573Srgrimes regfree(preg); 2881573Srgrimes return(p->error); 2891573Srgrimes} 2901573Srgrimes 2911573Srgrimes/* 2921573Srgrimes - p_ere - ERE parser top level, concatenation and alternation 293227435Skevlo == static void p_ere(struct parse *p, int_t stop); 2941573Srgrimes */ 2951573Srgrimesstatic void 296170528Sdelphijp_ere(struct parse *p, 297227435Skevlo int stop) /* character this ERE should end at */ 2981573Srgrimes{ 29992889Sobrien char c; 30092889Sobrien sopno prevback; 30192889Sobrien sopno prevfwd; 30292889Sobrien sopno conc; 30392889Sobrien int first = 1; /* is this the first alternative? */ 3041573Srgrimes 3051573Srgrimes for (;;) { 3061573Srgrimes /* do a bunch of concatenated expressions */ 3071573Srgrimes conc = HERE(); 3081573Srgrimes while (MORE() && (c = PEEK()) != '|' && c != stop) 3091573Srgrimes p_ere_exp(p); 31017141Sjkh (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 3111573Srgrimes 3121573Srgrimes if (!EAT('|')) 3131573Srgrimes break; /* NOTE BREAK OUT */ 3141573Srgrimes 3151573Srgrimes if (first) { 3161573Srgrimes INSERT(OCH_, conc); /* offset is wrong */ 3171573Srgrimes prevfwd = conc; 3181573Srgrimes prevback = conc; 3191573Srgrimes first = 0; 3201573Srgrimes } 3211573Srgrimes ASTERN(OOR1, prevback); 3221573Srgrimes prevback = THERE(); 3231573Srgrimes AHEAD(prevfwd); /* fix previous offset */ 3241573Srgrimes prevfwd = HERE(); 3251573Srgrimes EMIT(OOR2, 0); /* offset is very wrong */ 3261573Srgrimes } 3271573Srgrimes 3281573Srgrimes if (!first) { /* tail-end fixups */ 3291573Srgrimes AHEAD(prevfwd); 3301573Srgrimes ASTERN(O_CH, prevback); 3311573Srgrimes } 3321573Srgrimes 3331573Srgrimes assert(!MORE() || SEE(stop)); 3341573Srgrimes} 3351573Srgrimes 3361573Srgrimes/* 3371573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 33892889Sobrien == static void p_ere_exp(struct parse *p); 3391573Srgrimes */ 3401573Srgrimesstatic void 341170528Sdelphijp_ere_exp(struct parse *p) 3421573Srgrimes{ 34392889Sobrien char c; 344132019Stjr wint_t wc; 34592889Sobrien sopno pos; 34692889Sobrien int count; 34792889Sobrien int count2; 34892889Sobrien sopno subno; 3491573Srgrimes int wascaret = 0; 3501573Srgrimes 3511573Srgrimes assert(MORE()); /* caller should have ensured this */ 3521573Srgrimes c = GETNEXT(); 3531573Srgrimes 3541573Srgrimes pos = HERE(); 3551573Srgrimes switch (c) { 3561573Srgrimes case '(': 35717141Sjkh (void)REQUIRE(MORE(), REG_EPAREN); 3581573Srgrimes p->g->nsub++; 3591573Srgrimes subno = p->g->nsub; 3601573Srgrimes if (subno < NPAREN) 3611573Srgrimes p->pbegin[subno] = HERE(); 3621573Srgrimes EMIT(OLPAREN, subno); 3631573Srgrimes if (!SEE(')')) 3641573Srgrimes p_ere(p, ')'); 3651573Srgrimes if (subno < NPAREN) { 3661573Srgrimes p->pend[subno] = HERE(); 3671573Srgrimes assert(p->pend[subno] != 0); 3681573Srgrimes } 3691573Srgrimes EMIT(ORPAREN, subno); 37017141Sjkh (void)MUSTEAT(')', REG_EPAREN); 3711573Srgrimes break; 3721573Srgrimes#ifndef POSIX_MISTAKE 3731573Srgrimes case ')': /* happens only if no current unmatched ( */ 3741573Srgrimes /* 3751573Srgrimes * You may ask, why the ifndef? Because I didn't notice 3761573Srgrimes * this until slightly too late for 1003.2, and none of the 3771573Srgrimes * other 1003.2 regular-expression reviewers noticed it at 3781573Srgrimes * all. So an unmatched ) is legal POSIX, at least until 3791573Srgrimes * we can get it fixed. 3801573Srgrimes */ 3811573Srgrimes SETERROR(REG_EPAREN); 3821573Srgrimes break; 3831573Srgrimes#endif 3841573Srgrimes case '^': 3851573Srgrimes EMIT(OBOL, 0); 3861573Srgrimes p->g->iflags |= USEBOL; 3871573Srgrimes p->g->nbol++; 3881573Srgrimes wascaret = 1; 3891573Srgrimes break; 3901573Srgrimes case '$': 3911573Srgrimes EMIT(OEOL, 0); 3921573Srgrimes p->g->iflags |= USEEOL; 3931573Srgrimes p->g->neol++; 3941573Srgrimes break; 3951573Srgrimes case '|': 3961573Srgrimes SETERROR(REG_EMPTY); 3971573Srgrimes break; 3981573Srgrimes case '*': 3991573Srgrimes case '+': 4001573Srgrimes case '?': 4011573Srgrimes SETERROR(REG_BADRPT); 4021573Srgrimes break; 4031573Srgrimes case '.': 4041573Srgrimes if (p->g->cflags®_NEWLINE) 4051573Srgrimes nonnewline(p); 4061573Srgrimes else 4071573Srgrimes EMIT(OANY, 0); 4081573Srgrimes break; 4091573Srgrimes case '[': 4101573Srgrimes p_bracket(p); 4111573Srgrimes break; 4121573Srgrimes case '\\': 41317141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 414132019Stjr wc = WGETNEXT(); 415132019Stjr ordinary(p, wc); 4161573Srgrimes break; 4171573Srgrimes case '{': /* okay as ordinary except if digit follows */ 41849094Sache (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 4191573Srgrimes /* FALLTHROUGH */ 4201573Srgrimes default: 421132019Stjr p->next--; 422132019Stjr wc = WGETNEXT(); 423132019Stjr ordinary(p, wc); 4241573Srgrimes break; 4251573Srgrimes } 4261573Srgrimes 4271573Srgrimes if (!MORE()) 4281573Srgrimes return; 4291573Srgrimes c = PEEK(); 4301573Srgrimes /* we call { a repetition if followed by a digit */ 4311573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 43249094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 4331573Srgrimes return; /* no repetition, we're done */ 4341573Srgrimes NEXT(); 4351573Srgrimes 43617141Sjkh (void)REQUIRE(!wascaret, REG_BADRPT); 4371573Srgrimes switch (c) { 4381573Srgrimes case '*': /* implemented as +? */ 4391573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 4401573Srgrimes INSERT(OPLUS_, pos); 4411573Srgrimes ASTERN(O_PLUS, pos); 4421573Srgrimes INSERT(OQUEST_, pos); 4431573Srgrimes ASTERN(O_QUEST, pos); 4441573Srgrimes break; 4451573Srgrimes case '+': 4461573Srgrimes INSERT(OPLUS_, pos); 4471573Srgrimes ASTERN(O_PLUS, pos); 4481573Srgrimes break; 4491573Srgrimes case '?': 4501573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 4511573Srgrimes INSERT(OCH_, pos); /* offset slightly wrong */ 4521573Srgrimes ASTERN(OOR1, pos); /* this one's right */ 4531573Srgrimes AHEAD(pos); /* fix the OCH_ */ 4541573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 4551573Srgrimes AHEAD(THERE()); /* ...so fix it */ 4561573Srgrimes ASTERN(O_CH, THERETHERE()); 4571573Srgrimes break; 4581573Srgrimes case '{': 4591573Srgrimes count = p_count(p); 4601573Srgrimes if (EAT(',')) { 46149094Sache if (isdigit((uch)PEEK())) { 4621573Srgrimes count2 = p_count(p); 46317141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 4641573Srgrimes } else /* single number with comma */ 4651573Srgrimes count2 = INFINITY; 4661573Srgrimes } else /* just a single number */ 4671573Srgrimes count2 = count; 4681573Srgrimes repeat(p, pos, count, count2); 4691573Srgrimes if (!EAT('}')) { /* error heuristics */ 4701573Srgrimes while (MORE() && PEEK() != '}') 4711573Srgrimes NEXT(); 47217141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 4731573Srgrimes SETERROR(REG_BADBR); 4741573Srgrimes } 4751573Srgrimes break; 4761573Srgrimes } 4771573Srgrimes 4781573Srgrimes if (!MORE()) 4791573Srgrimes return; 4801573Srgrimes c = PEEK(); 4811573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 48249094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 4831573Srgrimes return; 4841573Srgrimes SETERROR(REG_BADRPT); 4851573Srgrimes} 4861573Srgrimes 4871573Srgrimes/* 4881573Srgrimes - p_str - string (no metacharacters) "parser" 48992889Sobrien == static void p_str(struct parse *p); 4901573Srgrimes */ 4911573Srgrimesstatic void 492170528Sdelphijp_str(struct parse *p) 4931573Srgrimes{ 49417141Sjkh (void)REQUIRE(MORE(), REG_EMPTY); 4951573Srgrimes while (MORE()) 496132019Stjr ordinary(p, WGETNEXT()); 4971573Srgrimes} 4981573Srgrimes 4991573Srgrimes/* 5001573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation 501227435Skevlo == static void p_bre(struct parse *p, int end1, \ 502227435Skevlo == int end2); 5031573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check. 5041573Srgrimes * 5051573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first 506131973Stjr * taken as an ordinary character and then revised to be an anchor. 5071573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive. 5081573Srgrimes */ 5091573Srgrimesstatic void 510170528Sdelphijp_bre(struct parse *p, 511227435Skevlo int end1, /* first terminating character */ 512227435Skevlo int end2) /* second terminating character */ 5131573Srgrimes{ 51492889Sobrien sopno start = HERE(); 51592889Sobrien int first = 1; /* first subexpression? */ 51692889Sobrien int wasdollar = 0; 5171573Srgrimes 5181573Srgrimes if (EAT('^')) { 5191573Srgrimes EMIT(OBOL, 0); 5201573Srgrimes p->g->iflags |= USEBOL; 5211573Srgrimes p->g->nbol++; 5221573Srgrimes } 5231573Srgrimes while (MORE() && !SEETWO(end1, end2)) { 5241573Srgrimes wasdollar = p_simp_re(p, first); 5251573Srgrimes first = 0; 5261573Srgrimes } 5271573Srgrimes if (wasdollar) { /* oops, that was a trailing anchor */ 5281573Srgrimes DROP(1); 5291573Srgrimes EMIT(OEOL, 0); 5301573Srgrimes p->g->iflags |= USEEOL; 5311573Srgrimes p->g->neol++; 5321573Srgrimes } 5331573Srgrimes 53417141Sjkh (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 5351573Srgrimes} 5361573Srgrimes 5371573Srgrimes/* 5381573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 53992889Sobrien == static int p_simp_re(struct parse *p, int starordinary); 5401573Srgrimes */ 5411573Srgrimesstatic int /* was the simple RE an unbackslashed $? */ 542170528Sdelphijp_simp_re(struct parse *p, 543170528Sdelphij int starordinary) /* is a leading * an ordinary character? */ 5441573Srgrimes{ 54592889Sobrien int c; 54692889Sobrien int count; 54792889Sobrien int count2; 54892889Sobrien sopno pos; 54992889Sobrien int i; 550132019Stjr wint_t wc; 55192889Sobrien sopno subno; 5521573Srgrimes# define BACKSL (1<<CHAR_BIT) 5531573Srgrimes 5541573Srgrimes pos = HERE(); /* repetion op, if any, covers from here */ 5551573Srgrimes 5561573Srgrimes assert(MORE()); /* caller should have ensured this */ 5571573Srgrimes c = GETNEXT(); 5581573Srgrimes if (c == '\\') { 55917141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 56049094Sache c = BACKSL | GETNEXT(); 5611573Srgrimes } 5621573Srgrimes switch (c) { 5631573Srgrimes case '.': 5641573Srgrimes if (p->g->cflags®_NEWLINE) 5651573Srgrimes nonnewline(p); 5661573Srgrimes else 5671573Srgrimes EMIT(OANY, 0); 5681573Srgrimes break; 5691573Srgrimes case '[': 5701573Srgrimes p_bracket(p); 5711573Srgrimes break; 5721573Srgrimes case BACKSL|'{': 5731573Srgrimes SETERROR(REG_BADRPT); 5741573Srgrimes break; 5751573Srgrimes case BACKSL|'(': 5761573Srgrimes p->g->nsub++; 5771573Srgrimes subno = p->g->nsub; 5781573Srgrimes if (subno < NPAREN) 5791573Srgrimes p->pbegin[subno] = HERE(); 5801573Srgrimes EMIT(OLPAREN, subno); 5811573Srgrimes /* the MORE here is an error heuristic */ 5821573Srgrimes if (MORE() && !SEETWO('\\', ')')) 5831573Srgrimes p_bre(p, '\\', ')'); 5841573Srgrimes if (subno < NPAREN) { 5851573Srgrimes p->pend[subno] = HERE(); 5861573Srgrimes assert(p->pend[subno] != 0); 5871573Srgrimes } 5881573Srgrimes EMIT(ORPAREN, subno); 58917141Sjkh (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 5901573Srgrimes break; 5911573Srgrimes case BACKSL|')': /* should not get here -- must be user */ 5921573Srgrimes case BACKSL|'}': 5931573Srgrimes SETERROR(REG_EPAREN); 5941573Srgrimes break; 5951573Srgrimes case BACKSL|'1': 5961573Srgrimes case BACKSL|'2': 5971573Srgrimes case BACKSL|'3': 5981573Srgrimes case BACKSL|'4': 5991573Srgrimes case BACKSL|'5': 6001573Srgrimes case BACKSL|'6': 6011573Srgrimes case BACKSL|'7': 6021573Srgrimes case BACKSL|'8': 6031573Srgrimes case BACKSL|'9': 6041573Srgrimes i = (c&~BACKSL) - '0'; 6051573Srgrimes assert(i < NPAREN); 6061573Srgrimes if (p->pend[i] != 0) { 6071573Srgrimes assert(i <= p->g->nsub); 6081573Srgrimes EMIT(OBACK_, i); 6091573Srgrimes assert(p->pbegin[i] != 0); 6101573Srgrimes assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 6111573Srgrimes assert(OP(p->strip[p->pend[i]]) == ORPAREN); 6121573Srgrimes (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 6131573Srgrimes EMIT(O_BACK, i); 6141573Srgrimes } else 6151573Srgrimes SETERROR(REG_ESUBREG); 6161573Srgrimes p->g->backrefs = 1; 6171573Srgrimes break; 6181573Srgrimes case '*': 61917141Sjkh (void)REQUIRE(starordinary, REG_BADRPT); 6201573Srgrimes /* FALLTHROUGH */ 6211573Srgrimes default: 622132019Stjr p->next--; 623132019Stjr wc = WGETNEXT(); 624132019Stjr ordinary(p, wc); 6251573Srgrimes break; 6261573Srgrimes } 6271573Srgrimes 6281573Srgrimes if (EAT('*')) { /* implemented as +? */ 6291573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 6301573Srgrimes INSERT(OPLUS_, pos); 6311573Srgrimes ASTERN(O_PLUS, pos); 6321573Srgrimes INSERT(OQUEST_, pos); 6331573Srgrimes ASTERN(O_QUEST, pos); 6341573Srgrimes } else if (EATTWO('\\', '{')) { 6351573Srgrimes count = p_count(p); 6361573Srgrimes if (EAT(',')) { 63749094Sache if (MORE() && isdigit((uch)PEEK())) { 6381573Srgrimes count2 = p_count(p); 63917141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 6401573Srgrimes } else /* single number with comma */ 6411573Srgrimes count2 = INFINITY; 6421573Srgrimes } else /* just a single number */ 6431573Srgrimes count2 = count; 6441573Srgrimes repeat(p, pos, count, count2); 6451573Srgrimes if (!EATTWO('\\', '}')) { /* error heuristics */ 6461573Srgrimes while (MORE() && !SEETWO('\\', '}')) 6471573Srgrimes NEXT(); 64817141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 6491573Srgrimes SETERROR(REG_BADBR); 6501573Srgrimes } 65149094Sache } else if (c == '$') /* $ (but not \$) ends it */ 6521573Srgrimes return(1); 6531573Srgrimes 6541573Srgrimes return(0); 6551573Srgrimes} 6561573Srgrimes 6571573Srgrimes/* 6581573Srgrimes - p_count - parse a repetition count 65992889Sobrien == static int p_count(struct parse *p); 6601573Srgrimes */ 6611573Srgrimesstatic int /* the value */ 662170528Sdelphijp_count(struct parse *p) 6631573Srgrimes{ 66492889Sobrien int count = 0; 66592889Sobrien int ndigits = 0; 6661573Srgrimes 66749094Sache while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 6681573Srgrimes count = count*10 + (GETNEXT() - '0'); 6691573Srgrimes ndigits++; 6701573Srgrimes } 6711573Srgrimes 67217141Sjkh (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 6731573Srgrimes return(count); 6741573Srgrimes} 6751573Srgrimes 6761573Srgrimes/* 6771573Srgrimes - p_bracket - parse a bracketed character list 67892889Sobrien == static void p_bracket(struct parse *p); 6791573Srgrimes */ 6801573Srgrimesstatic void 681170528Sdelphijp_bracket(struct parse *p) 6821573Srgrimes{ 683132019Stjr cset *cs; 684132019Stjr wint_t ch; 6851573Srgrimes 6861573Srgrimes /* Dept of Truly Sickening Special-Case Kludges */ 6871573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 6881573Srgrimes EMIT(OBOW, 0); 6891573Srgrimes NEXTn(6); 6901573Srgrimes return; 6911573Srgrimes } 6921573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 6931573Srgrimes EMIT(OEOW, 0); 6941573Srgrimes NEXTn(6); 6951573Srgrimes return; 6961573Srgrimes } 6971573Srgrimes 698132019Stjr if ((cs = allocset(p)) == NULL) 699132019Stjr return; 700132019Stjr 701132019Stjr if (p->g->cflags®_ICASE) 702132019Stjr cs->icase = 1; 7031573Srgrimes if (EAT('^')) 704132019Stjr cs->invert = 1; 7051573Srgrimes if (EAT(']')) 706132019Stjr CHadd(p, cs, ']'); 7071573Srgrimes else if (EAT('-')) 708132019Stjr CHadd(p, cs, '-'); 7091573Srgrimes while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 7101573Srgrimes p_b_term(p, cs); 7111573Srgrimes if (EAT('-')) 712132019Stjr CHadd(p, cs, '-'); 71317141Sjkh (void)MUSTEAT(']', REG_EBRACK); 7141573Srgrimes 7151573Srgrimes if (p->error != 0) /* don't mess things up further */ 7161573Srgrimes return; 7171573Srgrimes 718132019Stjr if (cs->invert && p->g->cflags®_NEWLINE) 719132019Stjr cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 7201573Srgrimes 721132019Stjr if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 722132019Stjr ordinary(p, ch); 7231573Srgrimes freeset(p, cs); 7241573Srgrimes } else 725132019Stjr EMIT(OANYOF, (int)(cs - p->g->sets)); 7261573Srgrimes} 7271573Srgrimes 7281573Srgrimes/* 7291573Srgrimes - p_b_term - parse one term of a bracketed character list 73092889Sobrien == static void p_b_term(struct parse *p, cset *cs); 7311573Srgrimes */ 7321573Srgrimesstatic void 733170528Sdelphijp_b_term(struct parse *p, cset *cs) 7341573Srgrimes{ 73592889Sobrien char c; 736132019Stjr wint_t start, finish; 737132019Stjr wint_t i; 738227753Stheraven struct xlocale_collate *table = 739227753Stheraven (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 7401573Srgrimes 7411573Srgrimes /* classify what we've got */ 7421573Srgrimes switch ((MORE()) ? PEEK() : '\0') { 7431573Srgrimes case '[': 7441573Srgrimes c = (MORE2()) ? PEEK2() : '\0'; 7451573Srgrimes break; 7461573Srgrimes case '-': 7471573Srgrimes SETERROR(REG_ERANGE); 7481573Srgrimes return; /* NOTE RETURN */ 7491573Srgrimes break; 7501573Srgrimes default: 7511573Srgrimes c = '\0'; 7521573Srgrimes break; 7531573Srgrimes } 7541573Srgrimes 7551573Srgrimes switch (c) { 7561573Srgrimes case ':': /* character class */ 7571573Srgrimes NEXT2(); 75817141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 7591573Srgrimes c = PEEK(); 76017141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 7611573Srgrimes p_b_cclass(p, cs); 76217141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 76317141Sjkh (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 7641573Srgrimes break; 7651573Srgrimes case '=': /* equivalence class */ 7661573Srgrimes NEXT2(); 76717141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 7681573Srgrimes c = PEEK(); 76917141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 7701573Srgrimes p_b_eclass(p, cs); 77117141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 77217141Sjkh (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 7731573Srgrimes break; 7741573Srgrimes default: /* symbol, ordinary character, or range */ 7751573Srgrimes start = p_b_symbol(p); 7761573Srgrimes if (SEE('-') && MORE2() && PEEK2() != ']') { 7771573Srgrimes /* range */ 7781573Srgrimes NEXT(); 7791573Srgrimes if (EAT('-')) 7801573Srgrimes finish = '-'; 7811573Srgrimes else 7821573Srgrimes finish = p_b_symbol(p); 7831573Srgrimes } else 7841573Srgrimes finish = start; 78517514Sache if (start == finish) 786132019Stjr CHadd(p, cs, start); 78717514Sache else { 788227753Stheraven if (table->__collate_load_error) { 78924637Sache (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 790132019Stjr CHaddrange(p, cs, start, finish); 79124637Sache } else { 792227753Stheraven (void)REQUIRE(__collate_range_cmp(table, start, finish) <= 0, REG_ERANGE); 793132019Stjr for (i = 0; i <= UCHAR_MAX; i++) { 794227753Stheraven if ( __collate_range_cmp(table, start, i) <= 0 795227753Stheraven && __collate_range_cmp(table, i, finish) <= 0 79624637Sache ) 797132019Stjr CHadd(p, cs, i); 79824637Sache } 79917514Sache } 80017514Sache } 8011573Srgrimes break; 8021573Srgrimes } 8031573Srgrimes} 8041573Srgrimes 8051573Srgrimes/* 8061573Srgrimes - p_b_cclass - parse a character-class name and deal with it 80792889Sobrien == static void p_b_cclass(struct parse *p, cset *cs); 8081573Srgrimes */ 8091573Srgrimesstatic void 810170528Sdelphijp_b_cclass(struct parse *p, cset *cs) 8111573Srgrimes{ 81292889Sobrien char *sp = p->next; 81392889Sobrien size_t len; 814132019Stjr wctype_t wct; 815132019Stjr char clname[16]; 8161573Srgrimes 81717508Sache while (MORE() && isalpha((uch)PEEK())) 8181573Srgrimes NEXT(); 8191573Srgrimes len = p->next - sp; 820132019Stjr if (len >= sizeof(clname) - 1) { 8211573Srgrimes SETERROR(REG_ECTYPE); 8221573Srgrimes return; 8231573Srgrimes } 824132019Stjr memcpy(clname, sp, len); 825132019Stjr clname[len] = '\0'; 826132019Stjr if ((wct = wctype(clname)) == 0) { 827132019Stjr SETERROR(REG_ECTYPE); 828132019Stjr return; 82917508Sache } 830132019Stjr CHaddtype(p, cs, wct); 8311573Srgrimes} 8321573Srgrimes 8331573Srgrimes/* 8341573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it 83592889Sobrien == static void p_b_eclass(struct parse *p, cset *cs); 8361573Srgrimes * 8371573Srgrimes * This implementation is incomplete. xxx 8381573Srgrimes */ 8391573Srgrimesstatic void 840170528Sdelphijp_b_eclass(struct parse *p, cset *cs) 8411573Srgrimes{ 842132019Stjr wint_t c; 8431573Srgrimes 8441573Srgrimes c = p_b_coll_elem(p, '='); 845132019Stjr CHadd(p, cs, c); 8461573Srgrimes} 8471573Srgrimes 8481573Srgrimes/* 8491573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 850227414Skevlo == static wint_t p_b_symbol(struct parse *p); 8511573Srgrimes */ 852132019Stjrstatic wint_t /* value of symbol */ 853170528Sdelphijp_b_symbol(struct parse *p) 8541573Srgrimes{ 855132019Stjr wint_t value; 8561573Srgrimes 85717141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 8581573Srgrimes if (!EATTWO('[', '.')) 859132019Stjr return(WGETNEXT()); 8601573Srgrimes 8611573Srgrimes /* collating symbol */ 8621573Srgrimes value = p_b_coll_elem(p, '.'); 86317141Sjkh (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 8641573Srgrimes return(value); 8651573Srgrimes} 8661573Srgrimes 8671573Srgrimes/* 8681573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up 869227414Skevlo == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 8701573Srgrimes */ 871132019Stjrstatic wint_t /* value of collating element */ 872170528Sdelphijp_b_coll_elem(struct parse *p, 873170528Sdelphij wint_t endc) /* name ended by endc,']' */ 8741573Srgrimes{ 87592889Sobrien char *sp = p->next; 87692889Sobrien struct cname *cp; 87792889Sobrien int len; 878132019Stjr mbstate_t mbs; 879132019Stjr wchar_t wc; 880132019Stjr size_t clen; 8811573Srgrimes 8821573Srgrimes while (MORE() && !SEETWO(endc, ']')) 8831573Srgrimes NEXT(); 8841573Srgrimes if (!MORE()) { 8851573Srgrimes SETERROR(REG_EBRACK); 8861573Srgrimes return(0); 8871573Srgrimes } 8881573Srgrimes len = p->next - sp; 8891573Srgrimes for (cp = cnames; cp->name != NULL; cp++) 8901573Srgrimes if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 8911573Srgrimes return(cp->code); /* known name */ 892132019Stjr memset(&mbs, 0, sizeof(mbs)); 893132019Stjr if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 894132019Stjr return (wc); /* single character */ 895132019Stjr else if (clen == (size_t)-1 || clen == (size_t)-2) 896132019Stjr SETERROR(REG_ILLSEQ); 897132019Stjr else 898132019Stjr SETERROR(REG_ECOLLATE); /* neither */ 8991573Srgrimes return(0); 9001573Srgrimes} 9011573Srgrimes 9021573Srgrimes/* 9031573Srgrimes - othercase - return the case counterpart of an alphabetic 904227414Skevlo == static wint_t othercase(wint_t ch); 9051573Srgrimes */ 906132019Stjrstatic wint_t /* if no counterpart, return ch */ 907170528Sdelphijothercase(wint_t ch) 9081573Srgrimes{ 909132019Stjr assert(iswalpha(ch)); 910132019Stjr if (iswupper(ch)) 911132019Stjr return(towlower(ch)); 912132019Stjr else if (iswlower(ch)) 913132019Stjr return(towupper(ch)); 9141573Srgrimes else /* peculiar, but could happen */ 9151573Srgrimes return(ch); 9161573Srgrimes} 9171573Srgrimes 9181573Srgrimes/* 9191573Srgrimes - bothcases - emit a dualcase version of a two-case character 920227414Skevlo == static void bothcases(struct parse *p, wint_t ch); 9211573Srgrimes * 9221573Srgrimes * Boy, is this implementation ever a kludge... 9231573Srgrimes */ 9241573Srgrimesstatic void 925170528Sdelphijbothcases(struct parse *p, wint_t ch) 9261573Srgrimes{ 92792889Sobrien char *oldnext = p->next; 92892889Sobrien char *oldend = p->end; 929132019Stjr char bracket[3 + MB_LEN_MAX]; 930132019Stjr size_t n; 931132019Stjr mbstate_t mbs; 9321573Srgrimes 9331573Srgrimes assert(othercase(ch) != ch); /* p_bracket() would recurse */ 9341573Srgrimes p->next = bracket; 935132019Stjr memset(&mbs, 0, sizeof(mbs)); 936132019Stjr n = wcrtomb(bracket, ch, &mbs); 937132019Stjr assert(n != (size_t)-1); 938132019Stjr bracket[n] = ']'; 939132019Stjr bracket[n + 1] = '\0'; 940132019Stjr p->end = bracket+n+1; 9411573Srgrimes p_bracket(p); 942132019Stjr assert(p->next == p->end); 9431573Srgrimes p->next = oldnext; 9441573Srgrimes p->end = oldend; 9451573Srgrimes} 9461573Srgrimes 9471573Srgrimes/* 9481573Srgrimes - ordinary - emit an ordinary character 949227414Skevlo == static void ordinary(struct parse *p, wint_t ch); 9501573Srgrimes */ 9511573Srgrimesstatic void 952170528Sdelphijordinary(struct parse *p, wint_t ch) 9531573Srgrimes{ 954132019Stjr cset *cs; 9551573Srgrimes 956132019Stjr if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 9571573Srgrimes bothcases(p, ch); 958132019Stjr else if ((ch & OPDMASK) == ch) 959132019Stjr EMIT(OCHAR, ch); 960132019Stjr else { 961132019Stjr /* 962132019Stjr * Kludge: character is too big to fit into an OCHAR operand. 963132019Stjr * Emit a singleton set. 964132019Stjr */ 965132019Stjr if ((cs = allocset(p)) == NULL) 966132019Stjr return; 967132019Stjr CHadd(p, cs, ch); 968132019Stjr EMIT(OANYOF, (int)(cs - p->g->sets)); 969132019Stjr } 9701573Srgrimes} 9711573Srgrimes 9721573Srgrimes/* 9731573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY 97492889Sobrien == static void nonnewline(struct parse *p); 9751573Srgrimes * 9761573Srgrimes * Boy, is this implementation ever a kludge... 9771573Srgrimes */ 9781573Srgrimesstatic void 979170528Sdelphijnonnewline(struct parse *p) 9801573Srgrimes{ 98192889Sobrien char *oldnext = p->next; 98292889Sobrien char *oldend = p->end; 9831573Srgrimes char bracket[4]; 9841573Srgrimes 9851573Srgrimes p->next = bracket; 9861573Srgrimes p->end = bracket+3; 9871573Srgrimes bracket[0] = '^'; 9881573Srgrimes bracket[1] = '\n'; 9891573Srgrimes bracket[2] = ']'; 9901573Srgrimes bracket[3] = '\0'; 9911573Srgrimes p_bracket(p); 9921573Srgrimes assert(p->next == bracket+3); 9931573Srgrimes p->next = oldnext; 9941573Srgrimes p->end = oldend; 9951573Srgrimes} 9961573Srgrimes 9971573Srgrimes/* 9981573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed 99992889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to); 10001573Srgrimes */ 10011573Srgrimesstatic void 1002170528Sdelphijrepeat(struct parse *p, 1003170528Sdelphij sopno start, /* operand from here to end of strip */ 1004170528Sdelphij int from, /* repeated from this number */ 1005170528Sdelphij int to) /* to this number of times (maybe INFINITY) */ 10061573Srgrimes{ 100792889Sobrien sopno finish = HERE(); 10081573Srgrimes# define N 2 10091573Srgrimes# define INF 3 10101573Srgrimes# define REP(f, t) ((f)*8 + (t)) 10111573Srgrimes# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 101292889Sobrien sopno copy; 10131573Srgrimes 10141573Srgrimes if (p->error != 0) /* head off possible runaway recursion */ 10151573Srgrimes return; 10161573Srgrimes 10171573Srgrimes assert(from <= to); 10181573Srgrimes 10191573Srgrimes switch (REP(MAP(from), MAP(to))) { 10201573Srgrimes case REP(0, 0): /* must be user doing this */ 10211573Srgrimes DROP(finish-start); /* drop the operand */ 10221573Srgrimes break; 10231573Srgrimes case REP(0, 1): /* as x{1,1}? */ 10241573Srgrimes case REP(0, N): /* as x{1,n}? */ 10251573Srgrimes case REP(0, INF): /* as x{1,}? */ 10261573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10271573Srgrimes INSERT(OCH_, start); /* offset is wrong... */ 10281573Srgrimes repeat(p, start+1, 1, to); 10291573Srgrimes ASTERN(OOR1, start); 10301573Srgrimes AHEAD(start); /* ... fix it */ 10311573Srgrimes EMIT(OOR2, 0); 10321573Srgrimes AHEAD(THERE()); 10331573Srgrimes ASTERN(O_CH, THERETHERE()); 10341573Srgrimes break; 10351573Srgrimes case REP(1, 1): /* trivial case */ 10361573Srgrimes /* done */ 10371573Srgrimes break; 10381573Srgrimes case REP(1, N): /* as x?x{1,n-1} */ 10391573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10401573Srgrimes INSERT(OCH_, start); 10411573Srgrimes ASTERN(OOR1, start); 10421573Srgrimes AHEAD(start); 10431573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 10441573Srgrimes AHEAD(THERE()); /* ...so fix it */ 10451573Srgrimes ASTERN(O_CH, THERETHERE()); 10461573Srgrimes copy = dupl(p, start+1, finish+1); 10471573Srgrimes assert(copy == finish+4); 10481573Srgrimes repeat(p, copy, 1, to-1); 10491573Srgrimes break; 10501573Srgrimes case REP(1, INF): /* as x+ */ 10511573Srgrimes INSERT(OPLUS_, start); 10521573Srgrimes ASTERN(O_PLUS, start); 10531573Srgrimes break; 10541573Srgrimes case REP(N, N): /* as xx{m-1,n-1} */ 10551573Srgrimes copy = dupl(p, start, finish); 10561573Srgrimes repeat(p, copy, from-1, to-1); 10571573Srgrimes break; 10581573Srgrimes case REP(N, INF): /* as xx{n-1,INF} */ 10591573Srgrimes copy = dupl(p, start, finish); 10601573Srgrimes repeat(p, copy, from-1, to); 10611573Srgrimes break; 10621573Srgrimes default: /* "can't happen" */ 10631573Srgrimes SETERROR(REG_ASSERT); /* just in case */ 10641573Srgrimes break; 10651573Srgrimes } 10661573Srgrimes} 10671573Srgrimes 10681573Srgrimes/* 1069132019Stjr - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1070132019Stjr - character from the parse struct, signals a REG_ILLSEQ error if the 1071132019Stjr - character can't be converted. Returns the number of bytes consumed. 1072132019Stjr */ 1073132019Stjrstatic wint_t 1074170528Sdelphijwgetnext(struct parse *p) 1075132019Stjr{ 1076132019Stjr mbstate_t mbs; 1077132019Stjr wchar_t wc; 1078132019Stjr size_t n; 1079132019Stjr 1080132019Stjr memset(&mbs, 0, sizeof(mbs)); 1081132019Stjr n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1082132019Stjr if (n == (size_t)-1 || n == (size_t)-2) { 1083132019Stjr SETERROR(REG_ILLSEQ); 1084132019Stjr return (0); 1085132019Stjr } 1086132019Stjr if (n == 0) 1087132019Stjr n = 1; 1088132019Stjr p->next += n; 1089132019Stjr return (wc); 1090132019Stjr} 1091132019Stjr 1092132019Stjr/* 10931573Srgrimes - seterr - set an error condition 109492889Sobrien == static int seterr(struct parse *p, int e); 10951573Srgrimes */ 10961573Srgrimesstatic int /* useless but makes type checking happy */ 1097170528Sdelphijseterr(struct parse *p, int e) 10981573Srgrimes{ 10991573Srgrimes if (p->error == 0) /* keep earliest error condition */ 11001573Srgrimes p->error = e; 11011573Srgrimes p->next = nuls; /* try to bring things to a halt */ 11021573Srgrimes p->end = nuls; 11031573Srgrimes return(0); /* make the return value well-defined */ 11041573Srgrimes} 11051573Srgrimes 11061573Srgrimes/* 11071573Srgrimes - allocset - allocate a set of characters for [] 110892889Sobrien == static cset *allocset(struct parse *p); 11091573Srgrimes */ 11101573Srgrimesstatic cset * 1111170528Sdelphijallocset(struct parse *p) 11121573Srgrimes{ 1113132019Stjr cset *cs, *ncs; 11141573Srgrimes 1115132019Stjr ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); 1116132019Stjr if (ncs == NULL) { 1117132019Stjr SETERROR(REG_ESPACE); 1118132019Stjr return (NULL); 11191573Srgrimes } 1120132019Stjr p->g->sets = ncs; 1121132019Stjr cs = &p->g->sets[p->g->ncsets++]; 1122132019Stjr memset(cs, 0, sizeof(*cs)); 11231573Srgrimes 11241573Srgrimes return(cs); 11251573Srgrimes} 11261573Srgrimes 11271573Srgrimes/* 11281573Srgrimes - freeset - free a now-unused set 112992889Sobrien == static void freeset(struct parse *p, cset *cs); 11301573Srgrimes */ 11311573Srgrimesstatic void 1132170528Sdelphijfreeset(struct parse *p, cset *cs) 11331573Srgrimes{ 113492889Sobrien cset *top = &p->g->sets[p->g->ncsets]; 11351573Srgrimes 1136132019Stjr free(cs->wides); 1137132019Stjr free(cs->ranges); 1138132019Stjr free(cs->types); 1139132019Stjr memset(cs, 0, sizeof(*cs)); 11401573Srgrimes if (cs == top-1) /* recover only the easy case */ 11411573Srgrimes p->g->ncsets--; 11421573Srgrimes} 11431573Srgrimes 11441573Srgrimes/* 1145132019Stjr - singleton - Determine whether a set contains only one character, 1146132019Stjr - returning it if so, otherwise returning OUT. 11471573Srgrimes */ 1148132019Stjrstatic wint_t 1149170528Sdelphijsingleton(cset *cs) 11501573Srgrimes{ 1151132019Stjr wint_t i, s, n; 11521573Srgrimes 1153132019Stjr for (i = n = 0; i < NC; i++) 1154132019Stjr if (CHIN(cs, i)) { 1155132019Stjr n++; 1156132019Stjr s = i; 11571573Srgrimes } 1158132019Stjr if (n == 1) 1159132019Stjr return (s); 1160132019Stjr if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1161132019Stjr cs->icase == 0) 1162132019Stjr return (cs->wides[0]); 1163132019Stjr /* Don't bother handling the other cases. */ 1164132019Stjr return (OUT); 1165132019Stjr} 11661573Srgrimes 1167132019Stjr/* 1168132019Stjr - CHadd - add character to character set. 1169132019Stjr */ 1170132019Stjrstatic void 1171170528SdelphijCHadd(struct parse *p, cset *cs, wint_t ch) 1172132019Stjr{ 1173134802Stjr wint_t nch, *newwides; 1174132019Stjr assert(ch >= 0); 1175134802Stjr if (ch < NC) 1176132019Stjr cs->bmp[ch >> 3] |= 1 << (ch & 7); 1177134802Stjr else { 1178132019Stjr newwides = realloc(cs->wides, (cs->nwides + 1) * 1179132019Stjr sizeof(*cs->wides)); 1180132019Stjr if (newwides == NULL) { 1181132019Stjr SETERROR(REG_ESPACE); 1182132019Stjr return; 1183132019Stjr } 1184132019Stjr cs->wides = newwides; 1185132019Stjr cs->wides[cs->nwides++] = ch; 11861573Srgrimes } 1187134802Stjr if (cs->icase) { 1188134802Stjr if ((nch = towlower(ch)) < NC) 1189134802Stjr cs->bmp[nch >> 3] |= 1 << (nch & 7); 1190134802Stjr if ((nch = towupper(ch)) < NC) 1191134802Stjr cs->bmp[nch >> 3] |= 1 << (nch & 7); 1192134802Stjr } 11931573Srgrimes} 11941573Srgrimes 11951573Srgrimes/* 1196132019Stjr - CHaddrange - add all characters in the range [min,max] to a character set. 11971573Srgrimes */ 1198132019Stjrstatic void 1199170528SdelphijCHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 12001573Srgrimes{ 1201132019Stjr crange *newranges; 12021573Srgrimes 1203132019Stjr for (; min < NC && min <= max; min++) 1204132019Stjr CHadd(p, cs, min); 1205132019Stjr if (min >= max) 1206132019Stjr return; 1207132019Stjr newranges = realloc(cs->ranges, (cs->nranges + 1) * 1208132019Stjr sizeof(*cs->ranges)); 1209132019Stjr if (newranges == NULL) { 1210132019Stjr SETERROR(REG_ESPACE); 1211132019Stjr return; 1212132019Stjr } 1213132019Stjr cs->ranges = newranges; 1214132019Stjr cs->ranges[cs->nranges].min = min; 1215247596Sdelphij cs->ranges[cs->nranges].max = max; 1216132019Stjr cs->nranges++; 12171573Srgrimes} 12181573Srgrimes 12191573Srgrimes/* 1220132019Stjr - CHaddtype - add all characters of a certain type to a character set. 12211573Srgrimes */ 1222132019Stjrstatic void 1223170528SdelphijCHaddtype(struct parse *p, cset *cs, wctype_t wct) 12241573Srgrimes{ 1225132019Stjr wint_t i; 1226132019Stjr wctype_t *newtypes; 12271573Srgrimes 1228134802Stjr for (i = 0; i < NC; i++) 1229132019Stjr if (iswctype(i, wct)) 1230132019Stjr CHadd(p, cs, i); 1231132019Stjr newtypes = realloc(cs->types, (cs->ntypes + 1) * 1232132019Stjr sizeof(*cs->types)); 1233132019Stjr if (newtypes == NULL) { 1234132019Stjr SETERROR(REG_ESPACE); 1235132019Stjr return; 1236132019Stjr } 1237132019Stjr cs->types = newtypes; 1238132019Stjr cs->types[cs->ntypes++] = wct; 12391573Srgrimes} 12401573Srgrimes 12411573Srgrimes/* 12421573Srgrimes - dupl - emit a duplicate of a bunch of sops 124392889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish); 12441573Srgrimes */ 12451573Srgrimesstatic sopno /* start of duplicate */ 1246170528Sdelphijdupl(struct parse *p, 1247170528Sdelphij sopno start, /* from here */ 1248170528Sdelphij sopno finish) /* to this less one */ 12491573Srgrimes{ 125092889Sobrien sopno ret = HERE(); 125192889Sobrien sopno len = finish - start; 12521573Srgrimes 12531573Srgrimes assert(finish >= start); 12541573Srgrimes if (len == 0) 12551573Srgrimes return(ret); 1256227414Skevlo if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1257227414Skevlo return(ret); 12581573Srgrimes (void) memcpy((char *)(p->strip + p->slen), 12591573Srgrimes (char *)(p->strip + start), (size_t)len*sizeof(sop)); 12601573Srgrimes p->slen += len; 12611573Srgrimes return(ret); 12621573Srgrimes} 12631573Srgrimes 12641573Srgrimes/* 12651573Srgrimes - doemit - emit a strip operator 126692889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd); 12671573Srgrimes * 12681573Srgrimes * It might seem better to implement this as a macro with a function as 12691573Srgrimes * hard-case backup, but it's just too big and messy unless there are 12701573Srgrimes * some changes to the data structures. Maybe later. 12711573Srgrimes */ 12721573Srgrimesstatic void 1273170528Sdelphijdoemit(struct parse *p, sop op, size_t opnd) 12741573Srgrimes{ 12751573Srgrimes /* avoid making error situations worse */ 12761573Srgrimes if (p->error != 0) 12771573Srgrimes return; 12781573Srgrimes 12791573Srgrimes /* deal with oversize operands ("can't happen", more or less) */ 12801573Srgrimes assert(opnd < 1<<OPSHIFT); 12811573Srgrimes 12821573Srgrimes /* deal with undersized strip */ 12831573Srgrimes if (p->slen >= p->ssize) 1284227414Skevlo if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1285227414Skevlo return; 12861573Srgrimes 12871573Srgrimes /* finally, it's all reduced to the easy case */ 12881573Srgrimes p->strip[p->slen++] = SOP(op, opnd); 12891573Srgrimes} 12901573Srgrimes 12911573Srgrimes/* 12921573Srgrimes - doinsert - insert a sop into the strip 129392889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 12941573Srgrimes */ 12951573Srgrimesstatic void 1296170528Sdelphijdoinsert(struct parse *p, sop op, size_t opnd, sopno pos) 12971573Srgrimes{ 129892889Sobrien sopno sn; 129992889Sobrien sop s; 130092889Sobrien int i; 13011573Srgrimes 13021573Srgrimes /* avoid making error situations worse */ 13031573Srgrimes if (p->error != 0) 13041573Srgrimes return; 13051573Srgrimes 13061573Srgrimes sn = HERE(); 13071573Srgrimes EMIT(op, opnd); /* do checks, ensure space */ 13081573Srgrimes assert(HERE() == sn+1); 13091573Srgrimes s = p->strip[sn]; 13101573Srgrimes 13111573Srgrimes /* adjust paren pointers */ 13121573Srgrimes assert(pos > 0); 13131573Srgrimes for (i = 1; i < NPAREN; i++) { 13141573Srgrimes if (p->pbegin[i] >= pos) { 13151573Srgrimes p->pbegin[i]++; 13161573Srgrimes } 13171573Srgrimes if (p->pend[i] >= pos) { 13181573Srgrimes p->pend[i]++; 13191573Srgrimes } 13201573Srgrimes } 13211573Srgrimes 13221573Srgrimes memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 13231573Srgrimes (HERE()-pos-1)*sizeof(sop)); 13241573Srgrimes p->strip[pos] = s; 13251573Srgrimes} 13261573Srgrimes 13271573Srgrimes/* 13281573Srgrimes - dofwd - complete a forward reference 132992889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value); 13301573Srgrimes */ 13311573Srgrimesstatic void 1332170528Sdelphijdofwd(struct parse *p, sopno pos, sop value) 13331573Srgrimes{ 13341573Srgrimes /* avoid making error situations worse */ 13351573Srgrimes if (p->error != 0) 13361573Srgrimes return; 13371573Srgrimes 13381573Srgrimes assert(value < 1<<OPSHIFT); 13391573Srgrimes p->strip[pos] = OP(p->strip[pos]) | value; 13401573Srgrimes} 13411573Srgrimes 13421573Srgrimes/* 13431573Srgrimes - enlarge - enlarge the strip 1344227414Skevlo == static int enlarge(struct parse *p, sopno size); 13451573Srgrimes */ 1346227414Skevlostatic int 1347170528Sdelphijenlarge(struct parse *p, sopno size) 13481573Srgrimes{ 134992889Sobrien sop *sp; 13501573Srgrimes 13511573Srgrimes if (p->ssize >= size) 1352227414Skevlo return 1; 13531573Srgrimes 13541573Srgrimes sp = (sop *)realloc(p->strip, size*sizeof(sop)); 13551573Srgrimes if (sp == NULL) { 13561573Srgrimes SETERROR(REG_ESPACE); 1357227414Skevlo return 0; 13581573Srgrimes } 13591573Srgrimes p->strip = sp; 13601573Srgrimes p->ssize = size; 1361227414Skevlo return 1; 13621573Srgrimes} 13631573Srgrimes 13641573Srgrimes/* 13651573Srgrimes - stripsnug - compact the strip 136692889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g); 13671573Srgrimes */ 13681573Srgrimesstatic void 1369170528Sdelphijstripsnug(struct parse *p, struct re_guts *g) 13701573Srgrimes{ 13711573Srgrimes g->nstates = p->slen; 13721573Srgrimes g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 13731573Srgrimes if (g->strip == NULL) { 13741573Srgrimes SETERROR(REG_ESPACE); 13751573Srgrimes g->strip = p->strip; 13761573Srgrimes } 13771573Srgrimes} 13781573Srgrimes 13791573Srgrimes/* 13801573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string 138192889Sobrien == static void findmust(struct parse *p, struct re_guts *g); 13821573Srgrimes * 13831573Srgrimes * This algorithm could do fancy things like analyzing the operands of | 13841573Srgrimes * for common subsequences. Someday. This code is simple and finds most 13851573Srgrimes * of the interesting cases. 13861573Srgrimes * 13871573Srgrimes * Note that must and mlen got initialized during setup. 13881573Srgrimes */ 13891573Srgrimesstatic void 1390170528Sdelphijfindmust(struct parse *p, struct re_guts *g) 13911573Srgrimes{ 139292889Sobrien sop *scan; 13931573Srgrimes sop *start; 139492889Sobrien sop *newstart; 139592889Sobrien sopno newlen; 139692889Sobrien sop s; 139792889Sobrien char *cp; 139862391Sdcs int offset; 1399132019Stjr char buf[MB_LEN_MAX]; 1400132019Stjr size_t clen; 1401132019Stjr mbstate_t mbs; 14021573Srgrimes 14031573Srgrimes /* avoid making error situations worse */ 14041573Srgrimes if (p->error != 0) 14051573Srgrimes return; 14061573Srgrimes 1407132019Stjr /* 1408132019Stjr * It's not generally safe to do a ``char'' substring search on 1409132019Stjr * multibyte character strings, but it's safe for at least 1410132019Stjr * UTF-8 (see RFC 3629). 1411132019Stjr */ 1412132019Stjr if (MB_CUR_MAX > 1 && 1413132019Stjr strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1414132019Stjr return; 1415132019Stjr 14161573Srgrimes /* find the longest OCHAR sequence in strip */ 14171573Srgrimes newlen = 0; 141862391Sdcs offset = 0; 141962391Sdcs g->moffset = 0; 14201573Srgrimes scan = g->strip + 1; 14211573Srgrimes do { 14221573Srgrimes s = *scan++; 14231573Srgrimes switch (OP(s)) { 14241573Srgrimes case OCHAR: /* sequence member */ 1425132019Stjr if (newlen == 0) { /* new sequence */ 1426132019Stjr memset(&mbs, 0, sizeof(mbs)); 14271573Srgrimes newstart = scan - 1; 1428132019Stjr } 1429132019Stjr clen = wcrtomb(buf, OPND(s), &mbs); 1430132019Stjr if (clen == (size_t)-1) 1431132019Stjr goto toohard; 1432132019Stjr newlen += clen; 14331573Srgrimes break; 14341573Srgrimes case OPLUS_: /* things that don't break one */ 14351573Srgrimes case OLPAREN: 14361573Srgrimes case ORPAREN: 14371573Srgrimes break; 14381573Srgrimes case OQUEST_: /* things that must be skipped */ 14391573Srgrimes case OCH_: 1440131973Stjr offset = altoffset(scan, offset); 14411573Srgrimes scan--; 14421573Srgrimes do { 14431573Srgrimes scan += OPND(s); 14441573Srgrimes s = *scan; 14451573Srgrimes /* assert() interferes w debug printouts */ 14461573Srgrimes if (OP(s) != O_QUEST && OP(s) != O_CH && 14471573Srgrimes OP(s) != OOR2) { 14481573Srgrimes g->iflags |= BAD; 14491573Srgrimes return; 14501573Srgrimes } 14511573Srgrimes } while (OP(s) != O_QUEST && OP(s) != O_CH); 1452102411Scharnier /* FALLTHROUGH */ 145362391Sdcs case OBOW: /* things that break a sequence */ 145462391Sdcs case OEOW: 145562391Sdcs case OBOL: 145662391Sdcs case OEOL: 145762391Sdcs case O_QUEST: 145862391Sdcs case O_CH: 145962391Sdcs case OEND: 14601573Srgrimes if (newlen > g->mlen) { /* ends one */ 14611573Srgrimes start = newstart; 14621573Srgrimes g->mlen = newlen; 146362391Sdcs if (offset > -1) { 146462391Sdcs g->moffset += offset; 146562391Sdcs offset = newlen; 146662391Sdcs } else 146762391Sdcs g->moffset = offset; 146862391Sdcs } else { 146962391Sdcs if (offset > -1) 147062391Sdcs offset += newlen; 14711573Srgrimes } 14721573Srgrimes newlen = 0; 14731573Srgrimes break; 147462391Sdcs case OANY: 147562391Sdcs if (newlen > g->mlen) { /* ends one */ 147662391Sdcs start = newstart; 147762391Sdcs g->mlen = newlen; 147862391Sdcs if (offset > -1) { 147962391Sdcs g->moffset += offset; 148062391Sdcs offset = newlen; 148162391Sdcs } else 148262391Sdcs g->moffset = offset; 148362391Sdcs } else { 148462391Sdcs if (offset > -1) 148562391Sdcs offset += newlen; 148662391Sdcs } 148762391Sdcs if (offset > -1) 148862391Sdcs offset++; 148962391Sdcs newlen = 0; 149062391Sdcs break; 149162391Sdcs case OANYOF: /* may or may not invalidate offset */ 149262391Sdcs /* First, everything as OANY */ 149362391Sdcs if (newlen > g->mlen) { /* ends one */ 149462391Sdcs start = newstart; 149562391Sdcs g->mlen = newlen; 149662391Sdcs if (offset > -1) { 149762391Sdcs g->moffset += offset; 149862391Sdcs offset = newlen; 149962391Sdcs } else 150062391Sdcs g->moffset = offset; 150162391Sdcs } else { 150262391Sdcs if (offset > -1) 150362391Sdcs offset += newlen; 150462391Sdcs } 150562391Sdcs if (offset > -1) 150662391Sdcs offset++; 150762391Sdcs newlen = 0; 150862391Sdcs break; 1509132019Stjr toohard: 151062391Sdcs default: 151162391Sdcs /* Anything here makes it impossible or too hard 151262391Sdcs * to calculate the offset -- so we give up; 151362391Sdcs * save the last known good offset, in case the 151462391Sdcs * must sequence doesn't occur later. 151562391Sdcs */ 151662391Sdcs if (newlen > g->mlen) { /* ends one */ 151762391Sdcs start = newstart; 151862391Sdcs g->mlen = newlen; 151962391Sdcs if (offset > -1) 152062391Sdcs g->moffset += offset; 152162391Sdcs else 152262391Sdcs g->moffset = offset; 152362391Sdcs } 152462391Sdcs offset = -1; 152562391Sdcs newlen = 0; 152662391Sdcs break; 15271573Srgrimes } 15281573Srgrimes } while (OP(s) != OEND); 15291573Srgrimes 153062391Sdcs if (g->mlen == 0) { /* there isn't one */ 153162391Sdcs g->moffset = -1; 15321573Srgrimes return; 153362391Sdcs } 15341573Srgrimes 15351573Srgrimes /* turn it into a character string */ 15361573Srgrimes g->must = malloc((size_t)g->mlen + 1); 15371573Srgrimes if (g->must == NULL) { /* argh; just forget it */ 15381573Srgrimes g->mlen = 0; 153962391Sdcs g->moffset = -1; 15401573Srgrimes return; 15411573Srgrimes } 15421573Srgrimes cp = g->must; 15431573Srgrimes scan = start; 1544132019Stjr memset(&mbs, 0, sizeof(mbs)); 1545132019Stjr while (cp < g->must + g->mlen) { 15461573Srgrimes while (OP(s = *scan++) != OCHAR) 15471573Srgrimes continue; 1548132019Stjr clen = wcrtomb(cp, OPND(s), &mbs); 1549132019Stjr assert(clen != (size_t)-1); 1550132019Stjr cp += clen; 15511573Srgrimes } 15521573Srgrimes assert(cp == g->must + g->mlen); 15531573Srgrimes *cp++ = '\0'; /* just on general principles */ 15541573Srgrimes} 15551573Srgrimes 15561573Srgrimes/* 155762391Sdcs - altoffset - choose biggest offset among multiple choices 1558131973Stjr == static int altoffset(sop *scan, int offset); 155962391Sdcs * 156062391Sdcs * Compute, recursively if necessary, the largest offset among multiple 156162391Sdcs * re paths. 156262391Sdcs */ 156362391Sdcsstatic int 1564170528Sdelphijaltoffset(sop *scan, int offset) 156562391Sdcs{ 156662391Sdcs int largest; 156762391Sdcs int try; 156862391Sdcs sop s; 156962391Sdcs 157062391Sdcs /* If we gave up already on offsets, return */ 157162391Sdcs if (offset == -1) 157262391Sdcs return -1; 157362391Sdcs 157462391Sdcs largest = 0; 157562391Sdcs try = 0; 157662391Sdcs s = *scan++; 157762391Sdcs while (OP(s) != O_QUEST && OP(s) != O_CH) { 157862391Sdcs switch (OP(s)) { 157962391Sdcs case OOR1: 158062391Sdcs if (try > largest) 158162391Sdcs largest = try; 158262391Sdcs try = 0; 158362391Sdcs break; 158462391Sdcs case OQUEST_: 158562391Sdcs case OCH_: 1586131973Stjr try = altoffset(scan, try); 158762391Sdcs if (try == -1) 158862391Sdcs return -1; 158962391Sdcs scan--; 159062391Sdcs do { 159162391Sdcs scan += OPND(s); 159262391Sdcs s = *scan; 159362391Sdcs if (OP(s) != O_QUEST && OP(s) != O_CH && 159462391Sdcs OP(s) != OOR2) 159562391Sdcs return -1; 159662391Sdcs } while (OP(s) != O_QUEST && OP(s) != O_CH); 159762855Sdcs /* We must skip to the next position, or we'll 159862855Sdcs * leave altoffset() too early. 159962855Sdcs */ 160062855Sdcs scan++; 160162391Sdcs break; 160262391Sdcs case OANYOF: 160362391Sdcs case OCHAR: 160462391Sdcs case OANY: 160562391Sdcs try++; 160662391Sdcs case OBOW: 160762391Sdcs case OEOW: 160862391Sdcs case OLPAREN: 160962391Sdcs case ORPAREN: 161062391Sdcs case OOR2: 161162391Sdcs break; 161262391Sdcs default: 161362391Sdcs try = -1; 161462391Sdcs break; 161562391Sdcs } 161662391Sdcs if (try == -1) 161762391Sdcs return -1; 161862391Sdcs s = *scan++; 161962391Sdcs } 162062391Sdcs 162162391Sdcs if (try > largest) 162262391Sdcs largest = try; 162362391Sdcs 162462391Sdcs return largest+offset; 162562391Sdcs} 162662391Sdcs 162762391Sdcs/* 162862232Sdcs - computejumps - compute char jumps for BM scan 162992889Sobrien == static void computejumps(struct parse *p, struct re_guts *g); 163062232Sdcs * 163162232Sdcs * This algorithm assumes g->must exists and is has size greater than 163262232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 163362232Sdcs * Sara Baase. 163462232Sdcs * 163562232Sdcs * A char jump is the number of characters one needs to jump based on 163662232Sdcs * the value of the character from the text that was mismatched. 163762232Sdcs */ 163862232Sdcsstatic void 1639170528Sdelphijcomputejumps(struct parse *p, struct re_guts *g) 164062232Sdcs{ 164162232Sdcs int ch; 164262232Sdcs int mindex; 164362232Sdcs 164462232Sdcs /* Avoid making errors worse */ 164562232Sdcs if (p->error != 0) 164662232Sdcs return; 164762232Sdcs 164862848Sdcs g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 164962232Sdcs if (g->charjump == NULL) /* Not a fatal error */ 165062232Sdcs return; 165162754Sdcs /* Adjust for signed chars, if necessary */ 165262754Sdcs g->charjump = &g->charjump[-(CHAR_MIN)]; 165362232Sdcs 165462232Sdcs /* If the character does not exist in the pattern, the jump 165562232Sdcs * is equal to the number of characters in the pattern. 165662232Sdcs */ 165762754Sdcs for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 165862232Sdcs g->charjump[ch] = g->mlen; 165962232Sdcs 166062232Sdcs /* If the character does exist, compute the jump that would 166162232Sdcs * take us to the last character in the pattern equal to it 166262232Sdcs * (notice that we match right to left, so that last character 166362232Sdcs * is the first one that would be matched). 166462232Sdcs */ 166562232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 1666111010Snectar g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 166762232Sdcs} 166862232Sdcs 166962232Sdcs/* 167062232Sdcs - computematchjumps - compute match jumps for BM scan 167192889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g); 167262232Sdcs * 167362232Sdcs * This algorithm assumes g->must exists and is has size greater than 167462232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 167562232Sdcs * Sara Baase. 167662232Sdcs * 167762232Sdcs * A match jump is the number of characters one needs to advance based 167862232Sdcs * on the already-matched suffix. 167962232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way 168062232Sdcs * the search algorithm works. 168162232Sdcs */ 168262232Sdcsstatic void 1683170528Sdelphijcomputematchjumps(struct parse *p, struct re_guts *g) 168462232Sdcs{ 168562232Sdcs int mindex; /* General "must" iterator */ 168662232Sdcs int suffix; /* Keeps track of matching suffix */ 168762232Sdcs int ssuffix; /* Keeps track of suffixes' suffix */ 168862232Sdcs int* pmatches; /* pmatches[k] points to the next i 168962232Sdcs * such that i+1...mlen is a substring 169062232Sdcs * of k+1...k+mlen-i-1 169162232Sdcs */ 169262232Sdcs 169362232Sdcs /* Avoid making errors worse */ 169462232Sdcs if (p->error != 0) 169562232Sdcs return; 169662232Sdcs 169762848Sdcs pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 169862232Sdcs if (pmatches == NULL) { 169962232Sdcs g->matchjump = NULL; 170062232Sdcs return; 170162232Sdcs } 170262232Sdcs 170362848Sdcs g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 170462232Sdcs if (g->matchjump == NULL) /* Not a fatal error */ 170562232Sdcs return; 170662232Sdcs 170762232Sdcs /* Set maximum possible jump for each character in the pattern */ 170862232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 170962232Sdcs g->matchjump[mindex] = 2*g->mlen - mindex - 1; 171062232Sdcs 171162232Sdcs /* Compute pmatches[] */ 171262232Sdcs for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 171362232Sdcs mindex--, suffix--) { 171462232Sdcs pmatches[mindex] = suffix; 171562232Sdcs 171662232Sdcs /* If a mismatch is found, interrupting the substring, 171762232Sdcs * compute the matchjump for that position. If no 171862232Sdcs * mismatch is found, then a text substring mismatched 171962232Sdcs * against the suffix will also mismatch against the 172062232Sdcs * substring. 172162232Sdcs */ 172262232Sdcs while (suffix < g->mlen 172362232Sdcs && g->must[mindex] != g->must[suffix]) { 172462232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 172562232Sdcs g->mlen - mindex - 1); 172662232Sdcs suffix = pmatches[suffix]; 172762232Sdcs } 172862232Sdcs } 172962232Sdcs 173062232Sdcs /* Compute the matchjump up to the last substring found to jump 173162232Sdcs * to the beginning of the largest must pattern prefix matching 173262232Sdcs * it's own suffix. 173362232Sdcs */ 173462232Sdcs for (mindex = 0; mindex <= suffix; mindex++) 173562232Sdcs g->matchjump[mindex] = MIN(g->matchjump[mindex], 173662232Sdcs g->mlen + suffix - mindex); 173762232Sdcs 173862232Sdcs ssuffix = pmatches[suffix]; 173962391Sdcs while (suffix < g->mlen) { 174062673Sdcs while (suffix <= ssuffix && suffix < g->mlen) { 174162232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 174262232Sdcs g->mlen + ssuffix - suffix); 174362232Sdcs suffix++; 174462232Sdcs } 174586208Sdcs if (suffix < g->mlen) 174686208Sdcs ssuffix = pmatches[ssuffix]; 174762232Sdcs } 174862232Sdcs 174962232Sdcs free(pmatches); 175062232Sdcs} 175162232Sdcs 175262232Sdcs/* 17531573Srgrimes - pluscount - count + nesting 175492889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g); 17551573Srgrimes */ 17561573Srgrimesstatic sopno /* nesting depth */ 1757170528Sdelphijpluscount(struct parse *p, struct re_guts *g) 17581573Srgrimes{ 175992889Sobrien sop *scan; 176092889Sobrien sop s; 176192889Sobrien sopno plusnest = 0; 176292889Sobrien sopno maxnest = 0; 17631573Srgrimes 17641573Srgrimes if (p->error != 0) 17651573Srgrimes return(0); /* there may not be an OEND */ 17661573Srgrimes 17671573Srgrimes scan = g->strip + 1; 17681573Srgrimes do { 17691573Srgrimes s = *scan++; 17701573Srgrimes switch (OP(s)) { 17711573Srgrimes case OPLUS_: 17721573Srgrimes plusnest++; 17731573Srgrimes break; 17741573Srgrimes case O_PLUS: 17751573Srgrimes if (plusnest > maxnest) 17761573Srgrimes maxnest = plusnest; 17771573Srgrimes plusnest--; 17781573Srgrimes break; 17791573Srgrimes } 17801573Srgrimes } while (OP(s) != OEND); 17811573Srgrimes if (plusnest != 0) 17821573Srgrimes g->iflags |= BAD; 17831573Srgrimes return(maxnest); 17841573Srgrimes} 1785