regcomp.c revision 136091
11573Srgrimes/*- 21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer. 31573Srgrimes * Copyright (c) 1992, 1993, 1994 41573Srgrimes * The Regents of the University of California. All rights reserved. 51573Srgrimes * 61573Srgrimes * This code is derived from software contributed to Berkeley by 71573Srgrimes * Henry Spencer. 81573Srgrimes * 91573Srgrimes * Redistribution and use in source and binary forms, with or without 101573Srgrimes * modification, are permitted provided that the following conditions 111573Srgrimes * are met: 121573Srgrimes * 1. Redistributions of source code must retain the above copyright 131573Srgrimes * notice, this list of conditions and the following disclaimer. 141573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 151573Srgrimes * notice, this list of conditions and the following disclaimer in the 161573Srgrimes * documentation and/or other materials provided with the distribution. 171573Srgrimes * 3. All advertising materials mentioning features or use of this software 181573Srgrimes * must display the following acknowledgement: 191573Srgrimes * This product includes software developed by the University of 201573Srgrimes * California, Berkeley and its contributors. 211573Srgrimes * 4. Neither the name of the University nor the names of its contributors 221573Srgrimes * may be used to endorse or promote products derived from this software 231573Srgrimes * without specific prior written permission. 241573Srgrimes * 251573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 261573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 271573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 281573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 291573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 301573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 311573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 321573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 331573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 341573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 351573Srgrimes * SUCH DAMAGE. 361573Srgrimes * 371573Srgrimes * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 381573Srgrimes */ 391573Srgrimes 401573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 411573Srgrimesstatic char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 421573Srgrimes#endif /* LIBC_SCCS and not lint */ 4392986Sobrien#include <sys/cdefs.h> 4492986Sobrien__FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 136091 2004-10-03 15:42:59Z stefanf $"); 451573Srgrimes 461573Srgrimes#include <sys/types.h> 471573Srgrimes#include <stdio.h> 481573Srgrimes#include <string.h> 491573Srgrimes#include <ctype.h> 501573Srgrimes#include <limits.h> 511573Srgrimes#include <stdlib.h> 521573Srgrimes#include <regex.h> 53136091Sstefanf#include <runetype.h> 54132019Stjr#include <wchar.h> 55132019Stjr#include <wctype.h> 561573Srgrimes 5719277Sache#include "collate.h" 5819277Sache 591573Srgrimes#include "utils.h" 601573Srgrimes#include "regex2.h" 611573Srgrimes 621573Srgrimes#include "cname.h" 631573Srgrimes 641573Srgrimes/* 651573Srgrimes * parse structure, passed up and down to avoid global variables and 661573Srgrimes * other clumsinesses 671573Srgrimes */ 681573Srgrimesstruct parse { 691573Srgrimes char *next; /* next character in RE */ 701573Srgrimes char *end; /* end of string (-> NUL normally) */ 711573Srgrimes int error; /* has an error been seen? */ 721573Srgrimes sop *strip; /* malloced strip */ 731573Srgrimes sopno ssize; /* malloced strip size (allocated) */ 741573Srgrimes sopno slen; /* malloced strip length (used) */ 751573Srgrimes int ncsalloc; /* number of csets allocated */ 761573Srgrimes struct re_guts *g; 771573Srgrimes# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 781573Srgrimes sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 791573Srgrimes sopno pend[NPAREN]; /* -> ) ([0] unused) */ 801573Srgrimes}; 811573Srgrimes 821573Srgrimes/* ========= begin header generated by ./mkh ========= */ 831573Srgrimes#ifdef __cplusplus 841573Srgrimesextern "C" { 851573Srgrimes#endif 861573Srgrimes 871573Srgrimes/* === regcomp.c === */ 88132019Stjrstatic void p_ere(struct parse *p, wint_t stop); 8992905Sobrienstatic void p_ere_exp(struct parse *p); 9092905Sobrienstatic void p_str(struct parse *p); 91132019Stjrstatic void p_bre(struct parse *p, wint_t end1, wint_t end2); 9292905Sobrienstatic int p_simp_re(struct parse *p, int starordinary); 9392905Sobrienstatic int p_count(struct parse *p); 9492905Sobrienstatic void p_bracket(struct parse *p); 9592905Sobrienstatic void p_b_term(struct parse *p, cset *cs); 9692905Sobrienstatic void p_b_cclass(struct parse *p, cset *cs); 9792905Sobrienstatic void p_b_eclass(struct parse *p, cset *cs); 98132019Stjrstatic wint_t p_b_symbol(struct parse *p); 99132019Stjrstatic wint_t p_b_coll_elem(struct parse *p, wint_t endc); 100132019Stjrstatic wint_t othercase(wint_t ch); 101132019Stjrstatic void bothcases(struct parse *p, wint_t ch); 102132019Stjrstatic void ordinary(struct parse *p, wint_t ch); 10392905Sobrienstatic void nonnewline(struct parse *p); 10492905Sobrienstatic void repeat(struct parse *p, sopno start, int from, int to); 10592905Sobrienstatic int seterr(struct parse *p, int e); 10692905Sobrienstatic cset *allocset(struct parse *p); 10792905Sobrienstatic void freeset(struct parse *p, cset *cs); 108132019Stjrstatic void CHadd(struct parse *p, cset *cs, wint_t ch); 109132019Stjrstatic void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 110132019Stjrstatic void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 111132019Stjrstatic wint_t singleton(cset *cs); 11292905Sobrienstatic sopno dupl(struct parse *p, sopno start, sopno finish); 11392905Sobrienstatic void doemit(struct parse *p, sop op, size_t opnd); 11492905Sobrienstatic void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 11592905Sobrienstatic void dofwd(struct parse *p, sopno pos, sop value); 11692905Sobrienstatic void enlarge(struct parse *p, sopno size); 11792905Sobrienstatic void stripsnug(struct parse *p, struct re_guts *g); 11892905Sobrienstatic void findmust(struct parse *p, struct re_guts *g); 119131973Stjrstatic int altoffset(sop *scan, int offset); 12092905Sobrienstatic void computejumps(struct parse *p, struct re_guts *g); 12192905Sobrienstatic void computematchjumps(struct parse *p, struct re_guts *g); 12292905Sobrienstatic sopno pluscount(struct parse *p, struct re_guts *g); 123132019Stjrstatic wint_t wgetnext(struct parse *p); 1241573Srgrimes 1251573Srgrimes#ifdef __cplusplus 1261573Srgrimes} 1271573Srgrimes#endif 1281573Srgrimes/* ========= end header generated by ./mkh ========= */ 1291573Srgrimes 1301573Srgrimesstatic char nuls[10]; /* place to point scanner in event of error */ 1311573Srgrimes 1321573Srgrimes/* 1331573Srgrimes * macros for use with parse structure 1341573Srgrimes * BEWARE: these know that the parse structure is named `p' !!! 1351573Srgrimes */ 1361573Srgrimes#define PEEK() (*p->next) 1371573Srgrimes#define PEEK2() (*(p->next+1)) 1381573Srgrimes#define MORE() (p->next < p->end) 1391573Srgrimes#define MORE2() (p->next+1 < p->end) 1401573Srgrimes#define SEE(c) (MORE() && PEEK() == (c)) 1411573Srgrimes#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 1421573Srgrimes#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 1431573Srgrimes#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 1441573Srgrimes#define NEXT() (p->next++) 1451573Srgrimes#define NEXT2() (p->next += 2) 1461573Srgrimes#define NEXTn(n) (p->next += (n)) 1471573Srgrimes#define GETNEXT() (*p->next++) 148132019Stjr#define WGETNEXT() wgetnext(p) 1491573Srgrimes#define SETERROR(e) seterr(p, (e)) 1501573Srgrimes#define REQUIRE(co, e) ((co) || SETERROR(e)) 1511573Srgrimes#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 1521573Srgrimes#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 1531573Srgrimes#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 1541573Srgrimes#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 1551573Srgrimes#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 1561573Srgrimes#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 1571573Srgrimes#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 1581573Srgrimes#define HERE() (p->slen) 1591573Srgrimes#define THERE() (p->slen - 1) 1601573Srgrimes#define THERETHERE() (p->slen - 2) 1611573Srgrimes#define DROP(n) (p->slen -= (n)) 1621573Srgrimes 1631573Srgrimes#ifndef NDEBUG 1641573Srgrimesstatic int never = 0; /* for use in asserts; shuts lint up */ 1651573Srgrimes#else 1661573Srgrimes#define never 0 /* some <assert.h>s have bugs too */ 1671573Srgrimes#endif 1681573Srgrimes 16962232Sdcs/* Macro used by computejump()/computematchjump() */ 17062232Sdcs#define MIN(a,b) ((a)<(b)?(a):(b)) 17162232Sdcs 1721573Srgrimes/* 1731573Srgrimes - regcomp - interface for parser and compilation 1741573Srgrimes = extern int regcomp(regex_t *, const char *, int); 1751573Srgrimes = #define REG_BASIC 0000 1761573Srgrimes = #define REG_EXTENDED 0001 1771573Srgrimes = #define REG_ICASE 0002 1781573Srgrimes = #define REG_NOSUB 0004 1791573Srgrimes = #define REG_NEWLINE 0010 1801573Srgrimes = #define REG_NOSPEC 0020 1811573Srgrimes = #define REG_PEND 0040 1821573Srgrimes = #define REG_DUMP 0200 1831573Srgrimes */ 1841573Srgrimesint /* 0 success, otherwise REG_something */ 1851573Srgrimesregcomp(preg, pattern, cflags) 186104358Smikeregex_t * __restrict preg; 187104358Smikeconst char * __restrict pattern; 1881573Srgrimesint cflags; 1891573Srgrimes{ 1901573Srgrimes struct parse pa; 19192889Sobrien struct re_guts *g; 19292889Sobrien struct parse *p = &pa; 19392889Sobrien int i; 19492889Sobrien size_t len; 1951573Srgrimes#ifdef REDEBUG 1961573Srgrimes# define GOODFLAGS(f) (f) 1971573Srgrimes#else 1981573Srgrimes# define GOODFLAGS(f) ((f)&~REG_DUMP) 1991573Srgrimes#endif 2001573Srgrimes 2011573Srgrimes cflags = GOODFLAGS(cflags); 2021573Srgrimes if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 2031573Srgrimes return(REG_INVARG); 2041573Srgrimes 2051573Srgrimes if (cflags®_PEND) { 2061573Srgrimes if (preg->re_endp < pattern) 2071573Srgrimes return(REG_INVARG); 2081573Srgrimes len = preg->re_endp - pattern; 2091573Srgrimes } else 2101573Srgrimes len = strlen((char *)pattern); 2111573Srgrimes 2121573Srgrimes /* do the mallocs early so failure handling is easy */ 213131973Stjr g = (struct re_guts *)malloc(sizeof(struct re_guts)); 2141573Srgrimes if (g == NULL) 2151573Srgrimes return(REG_ESPACE); 2161573Srgrimes p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 2171573Srgrimes p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 2181573Srgrimes p->slen = 0; 2191573Srgrimes if (p->strip == NULL) { 2201573Srgrimes free((char *)g); 2211573Srgrimes return(REG_ESPACE); 2221573Srgrimes } 2231573Srgrimes 2241573Srgrimes /* set things up */ 2251573Srgrimes p->g = g; 2261573Srgrimes p->next = (char *)pattern; /* convenience; we do not modify it */ 2271573Srgrimes p->end = p->next + len; 2281573Srgrimes p->error = 0; 2291573Srgrimes p->ncsalloc = 0; 2301573Srgrimes for (i = 0; i < NPAREN; i++) { 2311573Srgrimes p->pbegin[i] = 0; 2321573Srgrimes p->pend[i] = 0; 2331573Srgrimes } 2341573Srgrimes g->sets = NULL; 2351573Srgrimes g->ncsets = 0; 2361573Srgrimes g->cflags = cflags; 2371573Srgrimes g->iflags = 0; 2381573Srgrimes g->nbol = 0; 2391573Srgrimes g->neol = 0; 2401573Srgrimes g->must = NULL; 24162391Sdcs g->moffset = -1; 24262263Sdcs g->charjump = NULL; 24362263Sdcs g->matchjump = NULL; 2441573Srgrimes g->mlen = 0; 2451573Srgrimes g->nsub = 0; 2461573Srgrimes g->backrefs = 0; 2471573Srgrimes 2481573Srgrimes /* do it */ 2491573Srgrimes EMIT(OEND, 0); 2501573Srgrimes g->firststate = THERE(); 2511573Srgrimes if (cflags®_EXTENDED) 2521573Srgrimes p_ere(p, OUT); 2531573Srgrimes else if (cflags®_NOSPEC) 2541573Srgrimes p_str(p); 2551573Srgrimes else 2561573Srgrimes p_bre(p, OUT, OUT); 2571573Srgrimes EMIT(OEND, 0); 2581573Srgrimes g->laststate = THERE(); 2591573Srgrimes 2601573Srgrimes /* tidy up loose ends and fill things in */ 2611573Srgrimes stripsnug(p, g); 2621573Srgrimes findmust(p, g); 26362232Sdcs /* only use Boyer-Moore algorithm if the pattern is bigger 26462232Sdcs * than three characters 26562232Sdcs */ 26662232Sdcs if(g->mlen > 3) { 26762232Sdcs computejumps(p, g); 26862232Sdcs computematchjumps(p, g); 26962755Sdcs if(g->matchjump == NULL && g->charjump != NULL) { 27062232Sdcs free(g->charjump); 27162232Sdcs g->charjump = NULL; 27262232Sdcs } 27362232Sdcs } 2741573Srgrimes g->nplus = pluscount(p, g); 2751573Srgrimes g->magic = MAGIC2; 2761573Srgrimes preg->re_nsub = g->nsub; 2771573Srgrimes preg->re_g = g; 2781573Srgrimes preg->re_magic = MAGIC1; 2791573Srgrimes#ifndef REDEBUG 2801573Srgrimes /* not debugging, so can't rely on the assert() in regexec() */ 2811573Srgrimes if (g->iflags&BAD) 2821573Srgrimes SETERROR(REG_ASSERT); 2831573Srgrimes#endif 2841573Srgrimes 2851573Srgrimes /* win or lose, we're done */ 2861573Srgrimes if (p->error != 0) /* lose */ 2871573Srgrimes regfree(preg); 2881573Srgrimes return(p->error); 2891573Srgrimes} 2901573Srgrimes 2911573Srgrimes/* 2921573Srgrimes - p_ere - ERE parser top level, concatenation and alternation 29392889Sobrien == static void p_ere(struct parse *p, int stop); 2941573Srgrimes */ 2951573Srgrimesstatic void 2961573Srgrimesp_ere(p, stop) 29792889Sobrienstruct parse *p; 2981573Srgrimesint stop; /* character this ERE should end at */ 2991573Srgrimes{ 30092889Sobrien char c; 30192889Sobrien sopno prevback; 30292889Sobrien sopno prevfwd; 30392889Sobrien sopno conc; 30492889Sobrien int first = 1; /* is this the first alternative? */ 3051573Srgrimes 3061573Srgrimes for (;;) { 3071573Srgrimes /* do a bunch of concatenated expressions */ 3081573Srgrimes conc = HERE(); 3091573Srgrimes while (MORE() && (c = PEEK()) != '|' && c != stop) 3101573Srgrimes p_ere_exp(p); 31117141Sjkh (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 3121573Srgrimes 3131573Srgrimes if (!EAT('|')) 3141573Srgrimes break; /* NOTE BREAK OUT */ 3151573Srgrimes 3161573Srgrimes if (first) { 3171573Srgrimes INSERT(OCH_, conc); /* offset is wrong */ 3181573Srgrimes prevfwd = conc; 3191573Srgrimes prevback = conc; 3201573Srgrimes first = 0; 3211573Srgrimes } 3221573Srgrimes ASTERN(OOR1, prevback); 3231573Srgrimes prevback = THERE(); 3241573Srgrimes AHEAD(prevfwd); /* fix previous offset */ 3251573Srgrimes prevfwd = HERE(); 3261573Srgrimes EMIT(OOR2, 0); /* offset is very wrong */ 3271573Srgrimes } 3281573Srgrimes 3291573Srgrimes if (!first) { /* tail-end fixups */ 3301573Srgrimes AHEAD(prevfwd); 3311573Srgrimes ASTERN(O_CH, prevback); 3321573Srgrimes } 3331573Srgrimes 3341573Srgrimes assert(!MORE() || SEE(stop)); 3351573Srgrimes} 3361573Srgrimes 3371573Srgrimes/* 3381573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 33992889Sobrien == static void p_ere_exp(struct parse *p); 3401573Srgrimes */ 3411573Srgrimesstatic void 3421573Srgrimesp_ere_exp(p) 34392889Sobrienstruct parse *p; 3441573Srgrimes{ 34592889Sobrien char c; 346132019Stjr wint_t wc; 34792889Sobrien sopno pos; 34892889Sobrien int count; 34992889Sobrien int count2; 35092889Sobrien sopno subno; 3511573Srgrimes int wascaret = 0; 3521573Srgrimes 3531573Srgrimes assert(MORE()); /* caller should have ensured this */ 3541573Srgrimes c = GETNEXT(); 3551573Srgrimes 3561573Srgrimes pos = HERE(); 3571573Srgrimes switch (c) { 3581573Srgrimes case '(': 35917141Sjkh (void)REQUIRE(MORE(), REG_EPAREN); 3601573Srgrimes p->g->nsub++; 3611573Srgrimes subno = p->g->nsub; 3621573Srgrimes if (subno < NPAREN) 3631573Srgrimes p->pbegin[subno] = HERE(); 3641573Srgrimes EMIT(OLPAREN, subno); 3651573Srgrimes if (!SEE(')')) 3661573Srgrimes p_ere(p, ')'); 3671573Srgrimes if (subno < NPAREN) { 3681573Srgrimes p->pend[subno] = HERE(); 3691573Srgrimes assert(p->pend[subno] != 0); 3701573Srgrimes } 3711573Srgrimes EMIT(ORPAREN, subno); 37217141Sjkh (void)MUSTEAT(')', REG_EPAREN); 3731573Srgrimes break; 3741573Srgrimes#ifndef POSIX_MISTAKE 3751573Srgrimes case ')': /* happens only if no current unmatched ( */ 3761573Srgrimes /* 3771573Srgrimes * You may ask, why the ifndef? Because I didn't notice 3781573Srgrimes * this until slightly too late for 1003.2, and none of the 3791573Srgrimes * other 1003.2 regular-expression reviewers noticed it at 3801573Srgrimes * all. So an unmatched ) is legal POSIX, at least until 3811573Srgrimes * we can get it fixed. 3821573Srgrimes */ 3831573Srgrimes SETERROR(REG_EPAREN); 3841573Srgrimes break; 3851573Srgrimes#endif 3861573Srgrimes case '^': 3871573Srgrimes EMIT(OBOL, 0); 3881573Srgrimes p->g->iflags |= USEBOL; 3891573Srgrimes p->g->nbol++; 3901573Srgrimes wascaret = 1; 3911573Srgrimes break; 3921573Srgrimes case '$': 3931573Srgrimes EMIT(OEOL, 0); 3941573Srgrimes p->g->iflags |= USEEOL; 3951573Srgrimes p->g->neol++; 3961573Srgrimes break; 3971573Srgrimes case '|': 3981573Srgrimes SETERROR(REG_EMPTY); 3991573Srgrimes break; 4001573Srgrimes case '*': 4011573Srgrimes case '+': 4021573Srgrimes case '?': 4031573Srgrimes SETERROR(REG_BADRPT); 4041573Srgrimes break; 4051573Srgrimes case '.': 4061573Srgrimes if (p->g->cflags®_NEWLINE) 4071573Srgrimes nonnewline(p); 4081573Srgrimes else 4091573Srgrimes EMIT(OANY, 0); 4101573Srgrimes break; 4111573Srgrimes case '[': 4121573Srgrimes p_bracket(p); 4131573Srgrimes break; 4141573Srgrimes case '\\': 41517141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 416132019Stjr wc = WGETNEXT(); 417132019Stjr ordinary(p, wc); 4181573Srgrimes break; 4191573Srgrimes case '{': /* okay as ordinary except if digit follows */ 42049094Sache (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 4211573Srgrimes /* FALLTHROUGH */ 4221573Srgrimes default: 423132019Stjr p->next--; 424132019Stjr wc = WGETNEXT(); 425132019Stjr ordinary(p, wc); 4261573Srgrimes break; 4271573Srgrimes } 4281573Srgrimes 4291573Srgrimes if (!MORE()) 4301573Srgrimes return; 4311573Srgrimes c = PEEK(); 4321573Srgrimes /* we call { a repetition if followed by a digit */ 4331573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 43449094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 4351573Srgrimes return; /* no repetition, we're done */ 4361573Srgrimes NEXT(); 4371573Srgrimes 43817141Sjkh (void)REQUIRE(!wascaret, REG_BADRPT); 4391573Srgrimes switch (c) { 4401573Srgrimes case '*': /* implemented as +? */ 4411573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 4421573Srgrimes INSERT(OPLUS_, pos); 4431573Srgrimes ASTERN(O_PLUS, pos); 4441573Srgrimes INSERT(OQUEST_, pos); 4451573Srgrimes ASTERN(O_QUEST, pos); 4461573Srgrimes break; 4471573Srgrimes case '+': 4481573Srgrimes INSERT(OPLUS_, pos); 4491573Srgrimes ASTERN(O_PLUS, pos); 4501573Srgrimes break; 4511573Srgrimes case '?': 4521573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 4531573Srgrimes INSERT(OCH_, pos); /* offset slightly wrong */ 4541573Srgrimes ASTERN(OOR1, pos); /* this one's right */ 4551573Srgrimes AHEAD(pos); /* fix the OCH_ */ 4561573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 4571573Srgrimes AHEAD(THERE()); /* ...so fix it */ 4581573Srgrimes ASTERN(O_CH, THERETHERE()); 4591573Srgrimes break; 4601573Srgrimes case '{': 4611573Srgrimes count = p_count(p); 4621573Srgrimes if (EAT(',')) { 46349094Sache if (isdigit((uch)PEEK())) { 4641573Srgrimes count2 = p_count(p); 46517141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 4661573Srgrimes } else /* single number with comma */ 4671573Srgrimes count2 = INFINITY; 4681573Srgrimes } else /* just a single number */ 4691573Srgrimes count2 = count; 4701573Srgrimes repeat(p, pos, count, count2); 4711573Srgrimes if (!EAT('}')) { /* error heuristics */ 4721573Srgrimes while (MORE() && PEEK() != '}') 4731573Srgrimes NEXT(); 47417141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 4751573Srgrimes SETERROR(REG_BADBR); 4761573Srgrimes } 4771573Srgrimes break; 4781573Srgrimes } 4791573Srgrimes 4801573Srgrimes if (!MORE()) 4811573Srgrimes return; 4821573Srgrimes c = PEEK(); 4831573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 48449094Sache (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 4851573Srgrimes return; 4861573Srgrimes SETERROR(REG_BADRPT); 4871573Srgrimes} 4881573Srgrimes 4891573Srgrimes/* 4901573Srgrimes - p_str - string (no metacharacters) "parser" 49192889Sobrien == static void p_str(struct parse *p); 4921573Srgrimes */ 4931573Srgrimesstatic void 4941573Srgrimesp_str(p) 49592889Sobrienstruct parse *p; 4961573Srgrimes{ 49717141Sjkh (void)REQUIRE(MORE(), REG_EMPTY); 4981573Srgrimes while (MORE()) 499132019Stjr ordinary(p, WGETNEXT()); 5001573Srgrimes} 5011573Srgrimes 5021573Srgrimes/* 5031573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation 50492889Sobrien == static void p_bre(struct parse *p, int end1, \ 50592889Sobrien == int end2); 5061573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check. 5071573Srgrimes * 5081573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first 509131973Stjr * taken as an ordinary character and then revised to be an anchor. 5101573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive. 5111573Srgrimes */ 5121573Srgrimesstatic void 5131573Srgrimesp_bre(p, end1, end2) 51492889Sobrienstruct parse *p; 51592889Sobrienint end1; /* first terminating character */ 51692889Sobrienint end2; /* second terminating character */ 5171573Srgrimes{ 51892889Sobrien sopno start = HERE(); 51992889Sobrien int first = 1; /* first subexpression? */ 52092889Sobrien int wasdollar = 0; 5211573Srgrimes 5221573Srgrimes if (EAT('^')) { 5231573Srgrimes EMIT(OBOL, 0); 5241573Srgrimes p->g->iflags |= USEBOL; 5251573Srgrimes p->g->nbol++; 5261573Srgrimes } 5271573Srgrimes while (MORE() && !SEETWO(end1, end2)) { 5281573Srgrimes wasdollar = p_simp_re(p, first); 5291573Srgrimes first = 0; 5301573Srgrimes } 5311573Srgrimes if (wasdollar) { /* oops, that was a trailing anchor */ 5321573Srgrimes DROP(1); 5331573Srgrimes EMIT(OEOL, 0); 5341573Srgrimes p->g->iflags |= USEEOL; 5351573Srgrimes p->g->neol++; 5361573Srgrimes } 5371573Srgrimes 53817141Sjkh (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 5391573Srgrimes} 5401573Srgrimes 5411573Srgrimes/* 5421573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 54392889Sobrien == static int p_simp_re(struct parse *p, int starordinary); 5441573Srgrimes */ 5451573Srgrimesstatic int /* was the simple RE an unbackslashed $? */ 5461573Srgrimesp_simp_re(p, starordinary) 54792889Sobrienstruct parse *p; 5481573Srgrimesint starordinary; /* is a leading * an ordinary character? */ 5491573Srgrimes{ 55092889Sobrien int c; 55192889Sobrien int count; 55292889Sobrien int count2; 55392889Sobrien sopno pos; 55492889Sobrien int i; 555132019Stjr wint_t wc; 55692889Sobrien sopno subno; 5571573Srgrimes# define BACKSL (1<<CHAR_BIT) 5581573Srgrimes 5591573Srgrimes pos = HERE(); /* repetion op, if any, covers from here */ 5601573Srgrimes 5611573Srgrimes assert(MORE()); /* caller should have ensured this */ 5621573Srgrimes c = GETNEXT(); 5631573Srgrimes if (c == '\\') { 56417141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 56549094Sache c = BACKSL | GETNEXT(); 5661573Srgrimes } 5671573Srgrimes switch (c) { 5681573Srgrimes case '.': 5691573Srgrimes if (p->g->cflags®_NEWLINE) 5701573Srgrimes nonnewline(p); 5711573Srgrimes else 5721573Srgrimes EMIT(OANY, 0); 5731573Srgrimes break; 5741573Srgrimes case '[': 5751573Srgrimes p_bracket(p); 5761573Srgrimes break; 5771573Srgrimes case BACKSL|'{': 5781573Srgrimes SETERROR(REG_BADRPT); 5791573Srgrimes break; 5801573Srgrimes case BACKSL|'(': 5811573Srgrimes p->g->nsub++; 5821573Srgrimes subno = p->g->nsub; 5831573Srgrimes if (subno < NPAREN) 5841573Srgrimes p->pbegin[subno] = HERE(); 5851573Srgrimes EMIT(OLPAREN, subno); 5861573Srgrimes /* the MORE here is an error heuristic */ 5871573Srgrimes if (MORE() && !SEETWO('\\', ')')) 5881573Srgrimes p_bre(p, '\\', ')'); 5891573Srgrimes if (subno < NPAREN) { 5901573Srgrimes p->pend[subno] = HERE(); 5911573Srgrimes assert(p->pend[subno] != 0); 5921573Srgrimes } 5931573Srgrimes EMIT(ORPAREN, subno); 59417141Sjkh (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 5951573Srgrimes break; 5961573Srgrimes case BACKSL|')': /* should not get here -- must be user */ 5971573Srgrimes case BACKSL|'}': 5981573Srgrimes SETERROR(REG_EPAREN); 5991573Srgrimes break; 6001573Srgrimes case BACKSL|'1': 6011573Srgrimes case BACKSL|'2': 6021573Srgrimes case BACKSL|'3': 6031573Srgrimes case BACKSL|'4': 6041573Srgrimes case BACKSL|'5': 6051573Srgrimes case BACKSL|'6': 6061573Srgrimes case BACKSL|'7': 6071573Srgrimes case BACKSL|'8': 6081573Srgrimes case BACKSL|'9': 6091573Srgrimes i = (c&~BACKSL) - '0'; 6101573Srgrimes assert(i < NPAREN); 6111573Srgrimes if (p->pend[i] != 0) { 6121573Srgrimes assert(i <= p->g->nsub); 6131573Srgrimes EMIT(OBACK_, i); 6141573Srgrimes assert(p->pbegin[i] != 0); 6151573Srgrimes assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 6161573Srgrimes assert(OP(p->strip[p->pend[i]]) == ORPAREN); 6171573Srgrimes (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 6181573Srgrimes EMIT(O_BACK, i); 6191573Srgrimes } else 6201573Srgrimes SETERROR(REG_ESUBREG); 6211573Srgrimes p->g->backrefs = 1; 6221573Srgrimes break; 6231573Srgrimes case '*': 62417141Sjkh (void)REQUIRE(starordinary, REG_BADRPT); 6251573Srgrimes /* FALLTHROUGH */ 6261573Srgrimes default: 627132019Stjr p->next--; 628132019Stjr wc = WGETNEXT(); 629132019Stjr ordinary(p, wc); 6301573Srgrimes break; 6311573Srgrimes } 6321573Srgrimes 6331573Srgrimes if (EAT('*')) { /* implemented as +? */ 6341573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 6351573Srgrimes INSERT(OPLUS_, pos); 6361573Srgrimes ASTERN(O_PLUS, pos); 6371573Srgrimes INSERT(OQUEST_, pos); 6381573Srgrimes ASTERN(O_QUEST, pos); 6391573Srgrimes } else if (EATTWO('\\', '{')) { 6401573Srgrimes count = p_count(p); 6411573Srgrimes if (EAT(',')) { 64249094Sache if (MORE() && isdigit((uch)PEEK())) { 6431573Srgrimes count2 = p_count(p); 64417141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 6451573Srgrimes } else /* single number with comma */ 6461573Srgrimes count2 = INFINITY; 6471573Srgrimes } else /* just a single number */ 6481573Srgrimes count2 = count; 6491573Srgrimes repeat(p, pos, count, count2); 6501573Srgrimes if (!EATTWO('\\', '}')) { /* error heuristics */ 6511573Srgrimes while (MORE() && !SEETWO('\\', '}')) 6521573Srgrimes NEXT(); 65317141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 6541573Srgrimes SETERROR(REG_BADBR); 6551573Srgrimes } 65649094Sache } else if (c == '$') /* $ (but not \$) ends it */ 6571573Srgrimes return(1); 6581573Srgrimes 6591573Srgrimes return(0); 6601573Srgrimes} 6611573Srgrimes 6621573Srgrimes/* 6631573Srgrimes - p_count - parse a repetition count 66492889Sobrien == static int p_count(struct parse *p); 6651573Srgrimes */ 6661573Srgrimesstatic int /* the value */ 6671573Srgrimesp_count(p) 66892889Sobrienstruct parse *p; 6691573Srgrimes{ 67092889Sobrien int count = 0; 67192889Sobrien int ndigits = 0; 6721573Srgrimes 67349094Sache while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 6741573Srgrimes count = count*10 + (GETNEXT() - '0'); 6751573Srgrimes ndigits++; 6761573Srgrimes } 6771573Srgrimes 67817141Sjkh (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 6791573Srgrimes return(count); 6801573Srgrimes} 6811573Srgrimes 6821573Srgrimes/* 6831573Srgrimes - p_bracket - parse a bracketed character list 68492889Sobrien == static void p_bracket(struct parse *p); 6851573Srgrimes */ 6861573Srgrimesstatic void 6871573Srgrimesp_bracket(p) 68892889Sobrienstruct parse *p; 6891573Srgrimes{ 690132019Stjr cset *cs; 691132019Stjr wint_t ch; 6921573Srgrimes 6931573Srgrimes /* Dept of Truly Sickening Special-Case Kludges */ 6941573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 6951573Srgrimes EMIT(OBOW, 0); 6961573Srgrimes NEXTn(6); 6971573Srgrimes return; 6981573Srgrimes } 6991573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 7001573Srgrimes EMIT(OEOW, 0); 7011573Srgrimes NEXTn(6); 7021573Srgrimes return; 7031573Srgrimes } 7041573Srgrimes 705132019Stjr if ((cs = allocset(p)) == NULL) 706132019Stjr return; 707132019Stjr 708132019Stjr if (p->g->cflags®_ICASE) 709132019Stjr cs->icase = 1; 7101573Srgrimes if (EAT('^')) 711132019Stjr cs->invert = 1; 7121573Srgrimes if (EAT(']')) 713132019Stjr CHadd(p, cs, ']'); 7141573Srgrimes else if (EAT('-')) 715132019Stjr CHadd(p, cs, '-'); 7161573Srgrimes while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 7171573Srgrimes p_b_term(p, cs); 7181573Srgrimes if (EAT('-')) 719132019Stjr CHadd(p, cs, '-'); 72017141Sjkh (void)MUSTEAT(']', REG_EBRACK); 7211573Srgrimes 7221573Srgrimes if (p->error != 0) /* don't mess things up further */ 7231573Srgrimes return; 7241573Srgrimes 725132019Stjr if (cs->invert && p->g->cflags®_NEWLINE) 726132019Stjr cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 7271573Srgrimes 728132019Stjr if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 729132019Stjr ordinary(p, ch); 7301573Srgrimes freeset(p, cs); 7311573Srgrimes } else 732132019Stjr EMIT(OANYOF, (int)(cs - p->g->sets)); 7331573Srgrimes} 7341573Srgrimes 7351573Srgrimes/* 7361573Srgrimes - p_b_term - parse one term of a bracketed character list 73792889Sobrien == static void p_b_term(struct parse *p, cset *cs); 7381573Srgrimes */ 7391573Srgrimesstatic void 7401573Srgrimesp_b_term(p, cs) 74192889Sobrienstruct parse *p; 74292889Sobriencset *cs; 7431573Srgrimes{ 74492889Sobrien char c; 745132019Stjr wint_t start, finish; 746132019Stjr wint_t i; 7471573Srgrimes 7481573Srgrimes /* classify what we've got */ 7491573Srgrimes switch ((MORE()) ? PEEK() : '\0') { 7501573Srgrimes case '[': 7511573Srgrimes c = (MORE2()) ? PEEK2() : '\0'; 7521573Srgrimes break; 7531573Srgrimes case '-': 7541573Srgrimes SETERROR(REG_ERANGE); 7551573Srgrimes return; /* NOTE RETURN */ 7561573Srgrimes break; 7571573Srgrimes default: 7581573Srgrimes c = '\0'; 7591573Srgrimes break; 7601573Srgrimes } 7611573Srgrimes 7621573Srgrimes switch (c) { 7631573Srgrimes case ':': /* character class */ 7641573Srgrimes NEXT2(); 76517141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 7661573Srgrimes c = PEEK(); 76717141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 7681573Srgrimes p_b_cclass(p, cs); 76917141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 77017141Sjkh (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 7711573Srgrimes break; 7721573Srgrimes case '=': /* equivalence class */ 7731573Srgrimes NEXT2(); 77417141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 7751573Srgrimes c = PEEK(); 77617141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 7771573Srgrimes p_b_eclass(p, cs); 77817141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 77917141Sjkh (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 7801573Srgrimes break; 7811573Srgrimes default: /* symbol, ordinary character, or range */ 7821573Srgrimes start = p_b_symbol(p); 7831573Srgrimes if (SEE('-') && MORE2() && PEEK2() != ']') { 7841573Srgrimes /* range */ 7851573Srgrimes NEXT(); 7861573Srgrimes if (EAT('-')) 7871573Srgrimes finish = '-'; 7881573Srgrimes else 7891573Srgrimes finish = p_b_symbol(p); 7901573Srgrimes } else 7911573Srgrimes finish = start; 79217514Sache if (start == finish) 793132019Stjr CHadd(p, cs, start); 79417514Sache else { 79524637Sache if (__collate_load_error) { 79624637Sache (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 797132019Stjr CHaddrange(p, cs, start, finish); 79824637Sache } else { 79924637Sache (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 800132019Stjr for (i = 0; i <= UCHAR_MAX; i++) { 80124637Sache if ( __collate_range_cmp(start, i) <= 0 80224637Sache && __collate_range_cmp(i, finish) <= 0 80324637Sache ) 804132019Stjr CHadd(p, cs, i); 80524637Sache } 80617514Sache } 80717514Sache } 8081573Srgrimes break; 8091573Srgrimes } 8101573Srgrimes} 8111573Srgrimes 8121573Srgrimes/* 8131573Srgrimes - p_b_cclass - parse a character-class name and deal with it 81492889Sobrien == static void p_b_cclass(struct parse *p, cset *cs); 8151573Srgrimes */ 8161573Srgrimesstatic void 8171573Srgrimesp_b_cclass(p, cs) 81892889Sobrienstruct parse *p; 81992889Sobriencset *cs; 8201573Srgrimes{ 82192889Sobrien char *sp = p->next; 82292889Sobrien size_t len; 823132019Stjr wctype_t wct; 824132019Stjr char clname[16]; 8251573Srgrimes 82617508Sache while (MORE() && isalpha((uch)PEEK())) 8271573Srgrimes NEXT(); 8281573Srgrimes len = p->next - sp; 829132019Stjr if (len >= sizeof(clname) - 1) { 8301573Srgrimes SETERROR(REG_ECTYPE); 8311573Srgrimes return; 8321573Srgrimes } 833132019Stjr memcpy(clname, sp, len); 834132019Stjr clname[len] = '\0'; 835132019Stjr if ((wct = wctype(clname)) == 0) { 836132019Stjr SETERROR(REG_ECTYPE); 837132019Stjr return; 83817508Sache } 839132019Stjr CHaddtype(p, cs, wct); 8401573Srgrimes} 8411573Srgrimes 8421573Srgrimes/* 8431573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it 84492889Sobrien == static void p_b_eclass(struct parse *p, cset *cs); 8451573Srgrimes * 8461573Srgrimes * This implementation is incomplete. xxx 8471573Srgrimes */ 8481573Srgrimesstatic void 8491573Srgrimesp_b_eclass(p, cs) 85092889Sobrienstruct parse *p; 85192889Sobriencset *cs; 8521573Srgrimes{ 853132019Stjr wint_t c; 8541573Srgrimes 8551573Srgrimes c = p_b_coll_elem(p, '='); 856132019Stjr CHadd(p, cs, c); 8571573Srgrimes} 8581573Srgrimes 8591573Srgrimes/* 8601573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 86192889Sobrien == static char p_b_symbol(struct parse *p); 8621573Srgrimes */ 863132019Stjrstatic wint_t /* value of symbol */ 8641573Srgrimesp_b_symbol(p) 86592889Sobrienstruct parse *p; 8661573Srgrimes{ 867132019Stjr wint_t value; 8681573Srgrimes 86917141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 8701573Srgrimes if (!EATTWO('[', '.')) 871132019Stjr return(WGETNEXT()); 8721573Srgrimes 8731573Srgrimes /* collating symbol */ 8741573Srgrimes value = p_b_coll_elem(p, '.'); 87517141Sjkh (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 8761573Srgrimes return(value); 8771573Srgrimes} 8781573Srgrimes 8791573Srgrimes/* 8801573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up 88192889Sobrien == static char p_b_coll_elem(struct parse *p, int endc); 8821573Srgrimes */ 883132019Stjrstatic wint_t /* value of collating element */ 8841573Srgrimesp_b_coll_elem(p, endc) 88592889Sobrienstruct parse *p; 886132019Stjrwint_t endc; /* name ended by endc,']' */ 8871573Srgrimes{ 88892889Sobrien char *sp = p->next; 88992889Sobrien struct cname *cp; 89092889Sobrien int len; 891132019Stjr mbstate_t mbs; 892132019Stjr wchar_t wc; 893132019Stjr size_t clen; 8941573Srgrimes 8951573Srgrimes while (MORE() && !SEETWO(endc, ']')) 8961573Srgrimes NEXT(); 8971573Srgrimes if (!MORE()) { 8981573Srgrimes SETERROR(REG_EBRACK); 8991573Srgrimes return(0); 9001573Srgrimes } 9011573Srgrimes len = p->next - sp; 9021573Srgrimes for (cp = cnames; cp->name != NULL; cp++) 9031573Srgrimes if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 9041573Srgrimes return(cp->code); /* known name */ 905132019Stjr memset(&mbs, 0, sizeof(mbs)); 906132019Stjr if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 907132019Stjr return (wc); /* single character */ 908132019Stjr else if (clen == (size_t)-1 || clen == (size_t)-2) 909132019Stjr SETERROR(REG_ILLSEQ); 910132019Stjr else 911132019Stjr SETERROR(REG_ECOLLATE); /* neither */ 9121573Srgrimes return(0); 9131573Srgrimes} 9141573Srgrimes 9151573Srgrimes/* 9161573Srgrimes - othercase - return the case counterpart of an alphabetic 9171573Srgrimes == static char othercase(int ch); 9181573Srgrimes */ 919132019Stjrstatic wint_t /* if no counterpart, return ch */ 9201573Srgrimesothercase(ch) 921132019Stjrwint_t ch; 9221573Srgrimes{ 923132019Stjr assert(iswalpha(ch)); 924132019Stjr if (iswupper(ch)) 925132019Stjr return(towlower(ch)); 926132019Stjr else if (iswlower(ch)) 927132019Stjr return(towupper(ch)); 9281573Srgrimes else /* peculiar, but could happen */ 9291573Srgrimes return(ch); 9301573Srgrimes} 9311573Srgrimes 9321573Srgrimes/* 9331573Srgrimes - bothcases - emit a dualcase version of a two-case character 93492889Sobrien == static void bothcases(struct parse *p, int ch); 9351573Srgrimes * 9361573Srgrimes * Boy, is this implementation ever a kludge... 9371573Srgrimes */ 9381573Srgrimesstatic void 9391573Srgrimesbothcases(p, ch) 94092889Sobrienstruct parse *p; 941132019Stjrwint_t ch; 9421573Srgrimes{ 94392889Sobrien char *oldnext = p->next; 94492889Sobrien char *oldend = p->end; 945132019Stjr char bracket[3 + MB_LEN_MAX]; 946132019Stjr size_t n; 947132019Stjr mbstate_t mbs; 9481573Srgrimes 9491573Srgrimes assert(othercase(ch) != ch); /* p_bracket() would recurse */ 9501573Srgrimes p->next = bracket; 951132019Stjr memset(&mbs, 0, sizeof(mbs)); 952132019Stjr n = wcrtomb(bracket, ch, &mbs); 953132019Stjr assert(n != (size_t)-1); 954132019Stjr bracket[n] = ']'; 955132019Stjr bracket[n + 1] = '\0'; 956132019Stjr p->end = bracket+n+1; 9571573Srgrimes p_bracket(p); 958132019Stjr assert(p->next == p->end); 9591573Srgrimes p->next = oldnext; 9601573Srgrimes p->end = oldend; 9611573Srgrimes} 9621573Srgrimes 9631573Srgrimes/* 9641573Srgrimes - ordinary - emit an ordinary character 96592889Sobrien == static void ordinary(struct parse *p, int ch); 9661573Srgrimes */ 9671573Srgrimesstatic void 9681573Srgrimesordinary(p, ch) 96992889Sobrienstruct parse *p; 970132019Stjrwint_t ch; 9711573Srgrimes{ 972132019Stjr cset *cs; 9731573Srgrimes 974132019Stjr if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 9751573Srgrimes bothcases(p, ch); 976132019Stjr else if ((ch & OPDMASK) == ch) 977132019Stjr EMIT(OCHAR, ch); 978132019Stjr else { 979132019Stjr /* 980132019Stjr * Kludge: character is too big to fit into an OCHAR operand. 981132019Stjr * Emit a singleton set. 982132019Stjr */ 983132019Stjr if ((cs = allocset(p)) == NULL) 984132019Stjr return; 985132019Stjr CHadd(p, cs, ch); 986132019Stjr EMIT(OANYOF, (int)(cs - p->g->sets)); 987132019Stjr } 9881573Srgrimes} 9891573Srgrimes 9901573Srgrimes/* 9911573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY 99292889Sobrien == static void nonnewline(struct parse *p); 9931573Srgrimes * 9941573Srgrimes * Boy, is this implementation ever a kludge... 9951573Srgrimes */ 9961573Srgrimesstatic void 9971573Srgrimesnonnewline(p) 99892889Sobrienstruct parse *p; 9991573Srgrimes{ 100092889Sobrien char *oldnext = p->next; 100192889Sobrien char *oldend = p->end; 10021573Srgrimes char bracket[4]; 10031573Srgrimes 10041573Srgrimes p->next = bracket; 10051573Srgrimes p->end = bracket+3; 10061573Srgrimes bracket[0] = '^'; 10071573Srgrimes bracket[1] = '\n'; 10081573Srgrimes bracket[2] = ']'; 10091573Srgrimes bracket[3] = '\0'; 10101573Srgrimes p_bracket(p); 10111573Srgrimes assert(p->next == bracket+3); 10121573Srgrimes p->next = oldnext; 10131573Srgrimes p->end = oldend; 10141573Srgrimes} 10151573Srgrimes 10161573Srgrimes/* 10171573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed 101892889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to); 10191573Srgrimes */ 10201573Srgrimesstatic void 10211573Srgrimesrepeat(p, start, from, to) 102292889Sobrienstruct parse *p; 10231573Srgrimessopno start; /* operand from here to end of strip */ 10241573Srgrimesint from; /* repeated from this number */ 10251573Srgrimesint to; /* to this number of times (maybe INFINITY) */ 10261573Srgrimes{ 102792889Sobrien sopno finish = HERE(); 10281573Srgrimes# define N 2 10291573Srgrimes# define INF 3 10301573Srgrimes# define REP(f, t) ((f)*8 + (t)) 10311573Srgrimes# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 103292889Sobrien sopno copy; 10331573Srgrimes 10341573Srgrimes if (p->error != 0) /* head off possible runaway recursion */ 10351573Srgrimes return; 10361573Srgrimes 10371573Srgrimes assert(from <= to); 10381573Srgrimes 10391573Srgrimes switch (REP(MAP(from), MAP(to))) { 10401573Srgrimes case REP(0, 0): /* must be user doing this */ 10411573Srgrimes DROP(finish-start); /* drop the operand */ 10421573Srgrimes break; 10431573Srgrimes case REP(0, 1): /* as x{1,1}? */ 10441573Srgrimes case REP(0, N): /* as x{1,n}? */ 10451573Srgrimes case REP(0, INF): /* as x{1,}? */ 10461573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10471573Srgrimes INSERT(OCH_, start); /* offset is wrong... */ 10481573Srgrimes repeat(p, start+1, 1, to); 10491573Srgrimes ASTERN(OOR1, start); 10501573Srgrimes AHEAD(start); /* ... fix it */ 10511573Srgrimes EMIT(OOR2, 0); 10521573Srgrimes AHEAD(THERE()); 10531573Srgrimes ASTERN(O_CH, THERETHERE()); 10541573Srgrimes break; 10551573Srgrimes case REP(1, 1): /* trivial case */ 10561573Srgrimes /* done */ 10571573Srgrimes break; 10581573Srgrimes case REP(1, N): /* as x?x{1,n-1} */ 10591573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10601573Srgrimes INSERT(OCH_, start); 10611573Srgrimes ASTERN(OOR1, start); 10621573Srgrimes AHEAD(start); 10631573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 10641573Srgrimes AHEAD(THERE()); /* ...so fix it */ 10651573Srgrimes ASTERN(O_CH, THERETHERE()); 10661573Srgrimes copy = dupl(p, start+1, finish+1); 10671573Srgrimes assert(copy == finish+4); 10681573Srgrimes repeat(p, copy, 1, to-1); 10691573Srgrimes break; 10701573Srgrimes case REP(1, INF): /* as x+ */ 10711573Srgrimes INSERT(OPLUS_, start); 10721573Srgrimes ASTERN(O_PLUS, start); 10731573Srgrimes break; 10741573Srgrimes case REP(N, N): /* as xx{m-1,n-1} */ 10751573Srgrimes copy = dupl(p, start, finish); 10761573Srgrimes repeat(p, copy, from-1, to-1); 10771573Srgrimes break; 10781573Srgrimes case REP(N, INF): /* as xx{n-1,INF} */ 10791573Srgrimes copy = dupl(p, start, finish); 10801573Srgrimes repeat(p, copy, from-1, to); 10811573Srgrimes break; 10821573Srgrimes default: /* "can't happen" */ 10831573Srgrimes SETERROR(REG_ASSERT); /* just in case */ 10841573Srgrimes break; 10851573Srgrimes } 10861573Srgrimes} 10871573Srgrimes 10881573Srgrimes/* 1089132019Stjr - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1090132019Stjr - character from the parse struct, signals a REG_ILLSEQ error if the 1091132019Stjr - character can't be converted. Returns the number of bytes consumed. 1092132019Stjr */ 1093132019Stjrstatic wint_t 1094132019Stjrwgetnext(p) 1095132019Stjrstruct parse *p; 1096132019Stjr{ 1097132019Stjr mbstate_t mbs; 1098132019Stjr wchar_t wc; 1099132019Stjr size_t n; 1100132019Stjr 1101132019Stjr memset(&mbs, 0, sizeof(mbs)); 1102132019Stjr n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1103132019Stjr if (n == (size_t)-1 || n == (size_t)-2) { 1104132019Stjr SETERROR(REG_ILLSEQ); 1105132019Stjr return (0); 1106132019Stjr } 1107132019Stjr if (n == 0) 1108132019Stjr n = 1; 1109132019Stjr p->next += n; 1110132019Stjr return (wc); 1111132019Stjr} 1112132019Stjr 1113132019Stjr/* 11141573Srgrimes - seterr - set an error condition 111592889Sobrien == static int seterr(struct parse *p, int e); 11161573Srgrimes */ 11171573Srgrimesstatic int /* useless but makes type checking happy */ 11181573Srgrimesseterr(p, e) 111992889Sobrienstruct parse *p; 11201573Srgrimesint e; 11211573Srgrimes{ 11221573Srgrimes if (p->error == 0) /* keep earliest error condition */ 11231573Srgrimes p->error = e; 11241573Srgrimes p->next = nuls; /* try to bring things to a halt */ 11251573Srgrimes p->end = nuls; 11261573Srgrimes return(0); /* make the return value well-defined */ 11271573Srgrimes} 11281573Srgrimes 11291573Srgrimes/* 11301573Srgrimes - allocset - allocate a set of characters for [] 113192889Sobrien == static cset *allocset(struct parse *p); 11321573Srgrimes */ 11331573Srgrimesstatic cset * 11341573Srgrimesallocset(p) 113592889Sobrienstruct parse *p; 11361573Srgrimes{ 1137132019Stjr cset *cs, *ncs; 11381573Srgrimes 1139132019Stjr ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); 1140132019Stjr if (ncs == NULL) { 1141132019Stjr SETERROR(REG_ESPACE); 1142132019Stjr return (NULL); 11431573Srgrimes } 1144132019Stjr p->g->sets = ncs; 1145132019Stjr cs = &p->g->sets[p->g->ncsets++]; 1146132019Stjr memset(cs, 0, sizeof(*cs)); 11471573Srgrimes 11481573Srgrimes return(cs); 11491573Srgrimes} 11501573Srgrimes 11511573Srgrimes/* 11521573Srgrimes - freeset - free a now-unused set 115392889Sobrien == static void freeset(struct parse *p, cset *cs); 11541573Srgrimes */ 11551573Srgrimesstatic void 11561573Srgrimesfreeset(p, cs) 115792889Sobrienstruct parse *p; 115892889Sobriencset *cs; 11591573Srgrimes{ 116092889Sobrien cset *top = &p->g->sets[p->g->ncsets]; 11611573Srgrimes 1162132019Stjr free(cs->wides); 1163132019Stjr free(cs->ranges); 1164132019Stjr free(cs->types); 1165132019Stjr memset(cs, 0, sizeof(*cs)); 11661573Srgrimes if (cs == top-1) /* recover only the easy case */ 11671573Srgrimes p->g->ncsets--; 11681573Srgrimes} 11691573Srgrimes 11701573Srgrimes/* 1171132019Stjr - singleton - Determine whether a set contains only one character, 1172132019Stjr - returning it if so, otherwise returning OUT. 11731573Srgrimes */ 1174132019Stjrstatic wint_t 1175132019Stjrsingleton(cs) 117692889Sobriencset *cs; 11771573Srgrimes{ 1178132019Stjr wint_t i, s, n; 11791573Srgrimes 1180132019Stjr for (i = n = 0; i < NC; i++) 1181132019Stjr if (CHIN(cs, i)) { 1182132019Stjr n++; 1183132019Stjr s = i; 11841573Srgrimes } 1185132019Stjr if (n == 1) 1186132019Stjr return (s); 1187132019Stjr if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1188132019Stjr cs->icase == 0) 1189132019Stjr return (cs->wides[0]); 1190132019Stjr /* Don't bother handling the other cases. */ 1191132019Stjr return (OUT); 1192132019Stjr} 11931573Srgrimes 1194132019Stjr/* 1195132019Stjr - CHadd - add character to character set. 1196132019Stjr */ 1197132019Stjrstatic void 1198132019StjrCHadd(p, cs, ch) 1199132019Stjrstruct parse *p; 1200132019Stjrcset *cs; 1201132019Stjrwint_t ch; 1202132019Stjr{ 1203134802Stjr wint_t nch, *newwides; 1204132019Stjr assert(ch >= 0); 1205134802Stjr if (ch < NC) 1206132019Stjr cs->bmp[ch >> 3] |= 1 << (ch & 7); 1207134802Stjr else { 1208132019Stjr newwides = realloc(cs->wides, (cs->nwides + 1) * 1209132019Stjr sizeof(*cs->wides)); 1210132019Stjr if (newwides == NULL) { 1211132019Stjr SETERROR(REG_ESPACE); 1212132019Stjr return; 1213132019Stjr } 1214132019Stjr cs->wides = newwides; 1215132019Stjr cs->wides[cs->nwides++] = ch; 12161573Srgrimes } 1217134802Stjr if (cs->icase) { 1218134802Stjr if ((nch = towlower(ch)) < NC) 1219134802Stjr cs->bmp[nch >> 3] |= 1 << (nch & 7); 1220134802Stjr if ((nch = towupper(ch)) < NC) 1221134802Stjr cs->bmp[nch >> 3] |= 1 << (nch & 7); 1222134802Stjr } 12231573Srgrimes} 12241573Srgrimes 12251573Srgrimes/* 1226132019Stjr - CHaddrange - add all characters in the range [min,max] to a character set. 12271573Srgrimes */ 1228132019Stjrstatic void 1229132019StjrCHaddrange(p, cs, min, max) 123092889Sobrienstruct parse *p; 123192889Sobriencset *cs; 1232132019Stjrwint_t min, max; 12331573Srgrimes{ 1234132019Stjr crange *newranges; 12351573Srgrimes 1236132019Stjr for (; min < NC && min <= max; min++) 1237132019Stjr CHadd(p, cs, min); 1238132019Stjr if (min >= max) 1239132019Stjr return; 1240132019Stjr newranges = realloc(cs->ranges, (cs->nranges + 1) * 1241132019Stjr sizeof(*cs->ranges)); 1242132019Stjr if (newranges == NULL) { 1243132019Stjr SETERROR(REG_ESPACE); 1244132019Stjr return; 1245132019Stjr } 1246132019Stjr cs->ranges = newranges; 1247132019Stjr cs->ranges[cs->nranges].min = min; 1248132019Stjr cs->ranges[cs->nranges].min = max; 1249132019Stjr cs->nranges++; 12501573Srgrimes} 12511573Srgrimes 12521573Srgrimes/* 1253132019Stjr - CHaddtype - add all characters of a certain type to a character set. 12541573Srgrimes */ 1255132019Stjrstatic void 1256132019StjrCHaddtype(p, cs, wct) 125792889Sobrienstruct parse *p; 125892889Sobriencset *cs; 1259132019Stjrwctype_t wct; 12601573Srgrimes{ 1261132019Stjr wint_t i; 1262132019Stjr wctype_t *newtypes; 12631573Srgrimes 1264134802Stjr for (i = 0; i < NC; i++) 1265132019Stjr if (iswctype(i, wct)) 1266132019Stjr CHadd(p, cs, i); 1267132019Stjr newtypes = realloc(cs->types, (cs->ntypes + 1) * 1268132019Stjr sizeof(*cs->types)); 1269132019Stjr if (newtypes == NULL) { 1270132019Stjr SETERROR(REG_ESPACE); 1271132019Stjr return; 1272132019Stjr } 1273132019Stjr cs->types = newtypes; 1274132019Stjr cs->types[cs->ntypes++] = wct; 12751573Srgrimes} 12761573Srgrimes 12771573Srgrimes/* 12781573Srgrimes - dupl - emit a duplicate of a bunch of sops 127992889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish); 12801573Srgrimes */ 12811573Srgrimesstatic sopno /* start of duplicate */ 12821573Srgrimesdupl(p, start, finish) 128392889Sobrienstruct parse *p; 12841573Srgrimessopno start; /* from here */ 12851573Srgrimessopno finish; /* to this less one */ 12861573Srgrimes{ 128792889Sobrien sopno ret = HERE(); 128892889Sobrien sopno len = finish - start; 12891573Srgrimes 12901573Srgrimes assert(finish >= start); 12911573Srgrimes if (len == 0) 12921573Srgrimes return(ret); 12931573Srgrimes enlarge(p, p->ssize + len); /* this many unexpected additions */ 12941573Srgrimes assert(p->ssize >= p->slen + len); 12951573Srgrimes (void) memcpy((char *)(p->strip + p->slen), 12961573Srgrimes (char *)(p->strip + start), (size_t)len*sizeof(sop)); 12971573Srgrimes p->slen += len; 12981573Srgrimes return(ret); 12991573Srgrimes} 13001573Srgrimes 13011573Srgrimes/* 13021573Srgrimes - doemit - emit a strip operator 130392889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd); 13041573Srgrimes * 13051573Srgrimes * It might seem better to implement this as a macro with a function as 13061573Srgrimes * hard-case backup, but it's just too big and messy unless there are 13071573Srgrimes * some changes to the data structures. Maybe later. 13081573Srgrimes */ 13091573Srgrimesstatic void 13101573Srgrimesdoemit(p, op, opnd) 131192889Sobrienstruct parse *p; 13121573Srgrimessop op; 13131573Srgrimessize_t opnd; 13141573Srgrimes{ 13151573Srgrimes /* avoid making error situations worse */ 13161573Srgrimes if (p->error != 0) 13171573Srgrimes return; 13181573Srgrimes 13191573Srgrimes /* deal with oversize operands ("can't happen", more or less) */ 13201573Srgrimes assert(opnd < 1<<OPSHIFT); 13211573Srgrimes 13221573Srgrimes /* deal with undersized strip */ 13231573Srgrimes if (p->slen >= p->ssize) 13241573Srgrimes enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 13251573Srgrimes assert(p->slen < p->ssize); 13261573Srgrimes 13271573Srgrimes /* finally, it's all reduced to the easy case */ 13281573Srgrimes p->strip[p->slen++] = SOP(op, opnd); 13291573Srgrimes} 13301573Srgrimes 13311573Srgrimes/* 13321573Srgrimes - doinsert - insert a sop into the strip 133392889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 13341573Srgrimes */ 13351573Srgrimesstatic void 13361573Srgrimesdoinsert(p, op, opnd, pos) 133792889Sobrienstruct parse *p; 13381573Srgrimessop op; 13391573Srgrimessize_t opnd; 13401573Srgrimessopno pos; 13411573Srgrimes{ 134292889Sobrien sopno sn; 134392889Sobrien sop s; 134492889Sobrien int i; 13451573Srgrimes 13461573Srgrimes /* avoid making error situations worse */ 13471573Srgrimes if (p->error != 0) 13481573Srgrimes return; 13491573Srgrimes 13501573Srgrimes sn = HERE(); 13511573Srgrimes EMIT(op, opnd); /* do checks, ensure space */ 13521573Srgrimes assert(HERE() == sn+1); 13531573Srgrimes s = p->strip[sn]; 13541573Srgrimes 13551573Srgrimes /* adjust paren pointers */ 13561573Srgrimes assert(pos > 0); 13571573Srgrimes for (i = 1; i < NPAREN; i++) { 13581573Srgrimes if (p->pbegin[i] >= pos) { 13591573Srgrimes p->pbegin[i]++; 13601573Srgrimes } 13611573Srgrimes if (p->pend[i] >= pos) { 13621573Srgrimes p->pend[i]++; 13631573Srgrimes } 13641573Srgrimes } 13651573Srgrimes 13661573Srgrimes memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 13671573Srgrimes (HERE()-pos-1)*sizeof(sop)); 13681573Srgrimes p->strip[pos] = s; 13691573Srgrimes} 13701573Srgrimes 13711573Srgrimes/* 13721573Srgrimes - dofwd - complete a forward reference 137392889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value); 13741573Srgrimes */ 13751573Srgrimesstatic void 13761573Srgrimesdofwd(p, pos, value) 137792889Sobrienstruct parse *p; 137892889Sobriensopno pos; 13791573Srgrimessop value; 13801573Srgrimes{ 13811573Srgrimes /* avoid making error situations worse */ 13821573Srgrimes if (p->error != 0) 13831573Srgrimes return; 13841573Srgrimes 13851573Srgrimes assert(value < 1<<OPSHIFT); 13861573Srgrimes p->strip[pos] = OP(p->strip[pos]) | value; 13871573Srgrimes} 13881573Srgrimes 13891573Srgrimes/* 13901573Srgrimes - enlarge - enlarge the strip 139192889Sobrien == static void enlarge(struct parse *p, sopno size); 13921573Srgrimes */ 13931573Srgrimesstatic void 13941573Srgrimesenlarge(p, size) 139592889Sobrienstruct parse *p; 139692889Sobriensopno size; 13971573Srgrimes{ 139892889Sobrien sop *sp; 13991573Srgrimes 14001573Srgrimes if (p->ssize >= size) 14011573Srgrimes return; 14021573Srgrimes 14031573Srgrimes sp = (sop *)realloc(p->strip, size*sizeof(sop)); 14041573Srgrimes if (sp == NULL) { 14051573Srgrimes SETERROR(REG_ESPACE); 14061573Srgrimes return; 14071573Srgrimes } 14081573Srgrimes p->strip = sp; 14091573Srgrimes p->ssize = size; 14101573Srgrimes} 14111573Srgrimes 14121573Srgrimes/* 14131573Srgrimes - stripsnug - compact the strip 141492889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g); 14151573Srgrimes */ 14161573Srgrimesstatic void 14171573Srgrimesstripsnug(p, g) 141892889Sobrienstruct parse *p; 141992889Sobrienstruct re_guts *g; 14201573Srgrimes{ 14211573Srgrimes g->nstates = p->slen; 14221573Srgrimes g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 14231573Srgrimes if (g->strip == NULL) { 14241573Srgrimes SETERROR(REG_ESPACE); 14251573Srgrimes g->strip = p->strip; 14261573Srgrimes } 14271573Srgrimes} 14281573Srgrimes 14291573Srgrimes/* 14301573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string 143192889Sobrien == static void findmust(struct parse *p, struct re_guts *g); 14321573Srgrimes * 14331573Srgrimes * This algorithm could do fancy things like analyzing the operands of | 14341573Srgrimes * for common subsequences. Someday. This code is simple and finds most 14351573Srgrimes * of the interesting cases. 14361573Srgrimes * 14371573Srgrimes * Note that must and mlen got initialized during setup. 14381573Srgrimes */ 14391573Srgrimesstatic void 14401573Srgrimesfindmust(p, g) 14411573Srgrimesstruct parse *p; 144292889Sobrienstruct re_guts *g; 14431573Srgrimes{ 144492889Sobrien sop *scan; 14451573Srgrimes sop *start; 144692889Sobrien sop *newstart; 144792889Sobrien sopno newlen; 144892889Sobrien sop s; 144992889Sobrien char *cp; 145062391Sdcs int offset; 1451132019Stjr char buf[MB_LEN_MAX]; 1452132019Stjr size_t clen; 1453132019Stjr mbstate_t mbs; 14541573Srgrimes 14551573Srgrimes /* avoid making error situations worse */ 14561573Srgrimes if (p->error != 0) 14571573Srgrimes return; 14581573Srgrimes 1459132019Stjr /* 1460132019Stjr * It's not generally safe to do a ``char'' substring search on 1461132019Stjr * multibyte character strings, but it's safe for at least 1462132019Stjr * UTF-8 (see RFC 3629). 1463132019Stjr */ 1464132019Stjr if (MB_CUR_MAX > 1 && 1465132019Stjr strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1466132019Stjr return; 1467132019Stjr 14681573Srgrimes /* find the longest OCHAR sequence in strip */ 14691573Srgrimes newlen = 0; 147062391Sdcs offset = 0; 147162391Sdcs g->moffset = 0; 14721573Srgrimes scan = g->strip + 1; 14731573Srgrimes do { 14741573Srgrimes s = *scan++; 14751573Srgrimes switch (OP(s)) { 14761573Srgrimes case OCHAR: /* sequence member */ 1477132019Stjr if (newlen == 0) { /* new sequence */ 1478132019Stjr memset(&mbs, 0, sizeof(mbs)); 14791573Srgrimes newstart = scan - 1; 1480132019Stjr } 1481132019Stjr clen = wcrtomb(buf, OPND(s), &mbs); 1482132019Stjr if (clen == (size_t)-1) 1483132019Stjr goto toohard; 1484132019Stjr newlen += clen; 14851573Srgrimes break; 14861573Srgrimes case OPLUS_: /* things that don't break one */ 14871573Srgrimes case OLPAREN: 14881573Srgrimes case ORPAREN: 14891573Srgrimes break; 14901573Srgrimes case OQUEST_: /* things that must be skipped */ 14911573Srgrimes case OCH_: 1492131973Stjr offset = altoffset(scan, offset); 14931573Srgrimes scan--; 14941573Srgrimes do { 14951573Srgrimes scan += OPND(s); 14961573Srgrimes s = *scan; 14971573Srgrimes /* assert() interferes w debug printouts */ 14981573Srgrimes if (OP(s) != O_QUEST && OP(s) != O_CH && 14991573Srgrimes OP(s) != OOR2) { 15001573Srgrimes g->iflags |= BAD; 15011573Srgrimes return; 15021573Srgrimes } 15031573Srgrimes } while (OP(s) != O_QUEST && OP(s) != O_CH); 1504102411Scharnier /* FALLTHROUGH */ 150562391Sdcs case OBOW: /* things that break a sequence */ 150662391Sdcs case OEOW: 150762391Sdcs case OBOL: 150862391Sdcs case OEOL: 150962391Sdcs case O_QUEST: 151062391Sdcs case O_CH: 151162391Sdcs case OEND: 15121573Srgrimes if (newlen > g->mlen) { /* ends one */ 15131573Srgrimes start = newstart; 15141573Srgrimes g->mlen = newlen; 151562391Sdcs if (offset > -1) { 151662391Sdcs g->moffset += offset; 151762391Sdcs offset = newlen; 151862391Sdcs } else 151962391Sdcs g->moffset = offset; 152062391Sdcs } else { 152162391Sdcs if (offset > -1) 152262391Sdcs offset += newlen; 15231573Srgrimes } 15241573Srgrimes newlen = 0; 15251573Srgrimes break; 152662391Sdcs case OANY: 152762391Sdcs if (newlen > g->mlen) { /* ends one */ 152862391Sdcs start = newstart; 152962391Sdcs g->mlen = newlen; 153062391Sdcs if (offset > -1) { 153162391Sdcs g->moffset += offset; 153262391Sdcs offset = newlen; 153362391Sdcs } else 153462391Sdcs g->moffset = offset; 153562391Sdcs } else { 153662391Sdcs if (offset > -1) 153762391Sdcs offset += newlen; 153862391Sdcs } 153962391Sdcs if (offset > -1) 154062391Sdcs offset++; 154162391Sdcs newlen = 0; 154262391Sdcs break; 154362391Sdcs case OANYOF: /* may or may not invalidate offset */ 154462391Sdcs /* First, everything as OANY */ 154562391Sdcs if (newlen > g->mlen) { /* ends one */ 154662391Sdcs start = newstart; 154762391Sdcs g->mlen = newlen; 154862391Sdcs if (offset > -1) { 154962391Sdcs g->moffset += offset; 155062391Sdcs offset = newlen; 155162391Sdcs } else 155262391Sdcs g->moffset = offset; 155362391Sdcs } else { 155462391Sdcs if (offset > -1) 155562391Sdcs offset += newlen; 155662391Sdcs } 155762391Sdcs if (offset > -1) 155862391Sdcs offset++; 155962391Sdcs newlen = 0; 156062391Sdcs break; 1561132019Stjr toohard: 156262391Sdcs default: 156362391Sdcs /* Anything here makes it impossible or too hard 156462391Sdcs * to calculate the offset -- so we give up; 156562391Sdcs * save the last known good offset, in case the 156662391Sdcs * must sequence doesn't occur later. 156762391Sdcs */ 156862391Sdcs if (newlen > g->mlen) { /* ends one */ 156962391Sdcs start = newstart; 157062391Sdcs g->mlen = newlen; 157162391Sdcs if (offset > -1) 157262391Sdcs g->moffset += offset; 157362391Sdcs else 157462391Sdcs g->moffset = offset; 157562391Sdcs } 157662391Sdcs offset = -1; 157762391Sdcs newlen = 0; 157862391Sdcs break; 15791573Srgrimes } 15801573Srgrimes } while (OP(s) != OEND); 15811573Srgrimes 158262391Sdcs if (g->mlen == 0) { /* there isn't one */ 158362391Sdcs g->moffset = -1; 15841573Srgrimes return; 158562391Sdcs } 15861573Srgrimes 15871573Srgrimes /* turn it into a character string */ 15881573Srgrimes g->must = malloc((size_t)g->mlen + 1); 15891573Srgrimes if (g->must == NULL) { /* argh; just forget it */ 15901573Srgrimes g->mlen = 0; 159162391Sdcs g->moffset = -1; 15921573Srgrimes return; 15931573Srgrimes } 15941573Srgrimes cp = g->must; 15951573Srgrimes scan = start; 1596132019Stjr memset(&mbs, 0, sizeof(mbs)); 1597132019Stjr while (cp < g->must + g->mlen) { 15981573Srgrimes while (OP(s = *scan++) != OCHAR) 15991573Srgrimes continue; 1600132019Stjr clen = wcrtomb(cp, OPND(s), &mbs); 1601132019Stjr assert(clen != (size_t)-1); 1602132019Stjr cp += clen; 16031573Srgrimes } 16041573Srgrimes assert(cp == g->must + g->mlen); 16051573Srgrimes *cp++ = '\0'; /* just on general principles */ 16061573Srgrimes} 16071573Srgrimes 16081573Srgrimes/* 160962391Sdcs - altoffset - choose biggest offset among multiple choices 1610131973Stjr == static int altoffset(sop *scan, int offset); 161162391Sdcs * 161262391Sdcs * Compute, recursively if necessary, the largest offset among multiple 161362391Sdcs * re paths. 161462391Sdcs */ 161562391Sdcsstatic int 1616131973Stjraltoffset(scan, offset) 161762391Sdcssop *scan; 161862391Sdcsint offset; 161962391Sdcs{ 162062391Sdcs int largest; 162162391Sdcs int try; 162262391Sdcs sop s; 162362391Sdcs 162462391Sdcs /* If we gave up already on offsets, return */ 162562391Sdcs if (offset == -1) 162662391Sdcs return -1; 162762391Sdcs 162862391Sdcs largest = 0; 162962391Sdcs try = 0; 163062391Sdcs s = *scan++; 163162391Sdcs while (OP(s) != O_QUEST && OP(s) != O_CH) { 163262391Sdcs switch (OP(s)) { 163362391Sdcs case OOR1: 163462391Sdcs if (try > largest) 163562391Sdcs largest = try; 163662391Sdcs try = 0; 163762391Sdcs break; 163862391Sdcs case OQUEST_: 163962391Sdcs case OCH_: 1640131973Stjr try = altoffset(scan, try); 164162391Sdcs if (try == -1) 164262391Sdcs return -1; 164362391Sdcs scan--; 164462391Sdcs do { 164562391Sdcs scan += OPND(s); 164662391Sdcs s = *scan; 164762391Sdcs if (OP(s) != O_QUEST && OP(s) != O_CH && 164862391Sdcs OP(s) != OOR2) 164962391Sdcs return -1; 165062391Sdcs } while (OP(s) != O_QUEST && OP(s) != O_CH); 165162855Sdcs /* We must skip to the next position, or we'll 165262855Sdcs * leave altoffset() too early. 165362855Sdcs */ 165462855Sdcs scan++; 165562391Sdcs break; 165662391Sdcs case OANYOF: 165762391Sdcs case OCHAR: 165862391Sdcs case OANY: 165962391Sdcs try++; 166062391Sdcs case OBOW: 166162391Sdcs case OEOW: 166262391Sdcs case OLPAREN: 166362391Sdcs case ORPAREN: 166462391Sdcs case OOR2: 166562391Sdcs break; 166662391Sdcs default: 166762391Sdcs try = -1; 166862391Sdcs break; 166962391Sdcs } 167062391Sdcs if (try == -1) 167162391Sdcs return -1; 167262391Sdcs s = *scan++; 167362391Sdcs } 167462391Sdcs 167562391Sdcs if (try > largest) 167662391Sdcs largest = try; 167762391Sdcs 167862391Sdcs return largest+offset; 167962391Sdcs} 168062391Sdcs 168162391Sdcs/* 168262232Sdcs - computejumps - compute char jumps for BM scan 168392889Sobrien == static void computejumps(struct parse *p, struct re_guts *g); 168462232Sdcs * 168562232Sdcs * This algorithm assumes g->must exists and is has size greater than 168662232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 168762232Sdcs * Sara Baase. 168862232Sdcs * 168962232Sdcs * A char jump is the number of characters one needs to jump based on 169062232Sdcs * the value of the character from the text that was mismatched. 169162232Sdcs */ 169262232Sdcsstatic void 169362232Sdcscomputejumps(p, g) 169462232Sdcsstruct parse *p; 169562232Sdcsstruct re_guts *g; 169662232Sdcs{ 169762232Sdcs int ch; 169862232Sdcs int mindex; 169962232Sdcs 170062232Sdcs /* Avoid making errors worse */ 170162232Sdcs if (p->error != 0) 170262232Sdcs return; 170362232Sdcs 170462848Sdcs g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 170562232Sdcs if (g->charjump == NULL) /* Not a fatal error */ 170662232Sdcs return; 170762754Sdcs /* Adjust for signed chars, if necessary */ 170862754Sdcs g->charjump = &g->charjump[-(CHAR_MIN)]; 170962232Sdcs 171062232Sdcs /* If the character does not exist in the pattern, the jump 171162232Sdcs * is equal to the number of characters in the pattern. 171262232Sdcs */ 171362754Sdcs for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 171462232Sdcs g->charjump[ch] = g->mlen; 171562232Sdcs 171662232Sdcs /* If the character does exist, compute the jump that would 171762232Sdcs * take us to the last character in the pattern equal to it 171862232Sdcs * (notice that we match right to left, so that last character 171962232Sdcs * is the first one that would be matched). 172062232Sdcs */ 172162232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 1722111010Snectar g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 172362232Sdcs} 172462232Sdcs 172562232Sdcs/* 172662232Sdcs - computematchjumps - compute match jumps for BM scan 172792889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g); 172862232Sdcs * 172962232Sdcs * This algorithm assumes g->must exists and is has size greater than 173062232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by 173162232Sdcs * Sara Baase. 173262232Sdcs * 173362232Sdcs * A match jump is the number of characters one needs to advance based 173462232Sdcs * on the already-matched suffix. 173562232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way 173662232Sdcs * the search algorithm works. 173762232Sdcs */ 173862232Sdcsstatic void 173962232Sdcscomputematchjumps(p, g) 174062232Sdcsstruct parse *p; 174162232Sdcsstruct re_guts *g; 174262232Sdcs{ 174362232Sdcs int mindex; /* General "must" iterator */ 174462232Sdcs int suffix; /* Keeps track of matching suffix */ 174562232Sdcs int ssuffix; /* Keeps track of suffixes' suffix */ 174662232Sdcs int* pmatches; /* pmatches[k] points to the next i 174762232Sdcs * such that i+1...mlen is a substring 174862232Sdcs * of k+1...k+mlen-i-1 174962232Sdcs */ 175062232Sdcs 175162232Sdcs /* Avoid making errors worse */ 175262232Sdcs if (p->error != 0) 175362232Sdcs return; 175462232Sdcs 175562848Sdcs pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 175662232Sdcs if (pmatches == NULL) { 175762232Sdcs g->matchjump = NULL; 175862232Sdcs return; 175962232Sdcs } 176062232Sdcs 176162848Sdcs g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 176262232Sdcs if (g->matchjump == NULL) /* Not a fatal error */ 176362232Sdcs return; 176462232Sdcs 176562232Sdcs /* Set maximum possible jump for each character in the pattern */ 176662232Sdcs for (mindex = 0; mindex < g->mlen; mindex++) 176762232Sdcs g->matchjump[mindex] = 2*g->mlen - mindex - 1; 176862232Sdcs 176962232Sdcs /* Compute pmatches[] */ 177062232Sdcs for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 177162232Sdcs mindex--, suffix--) { 177262232Sdcs pmatches[mindex] = suffix; 177362232Sdcs 177462232Sdcs /* If a mismatch is found, interrupting the substring, 177562232Sdcs * compute the matchjump for that position. If no 177662232Sdcs * mismatch is found, then a text substring mismatched 177762232Sdcs * against the suffix will also mismatch against the 177862232Sdcs * substring. 177962232Sdcs */ 178062232Sdcs while (suffix < g->mlen 178162232Sdcs && g->must[mindex] != g->must[suffix]) { 178262232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 178362232Sdcs g->mlen - mindex - 1); 178462232Sdcs suffix = pmatches[suffix]; 178562232Sdcs } 178662232Sdcs } 178762232Sdcs 178862232Sdcs /* Compute the matchjump up to the last substring found to jump 178962232Sdcs * to the beginning of the largest must pattern prefix matching 179062232Sdcs * it's own suffix. 179162232Sdcs */ 179262232Sdcs for (mindex = 0; mindex <= suffix; mindex++) 179362232Sdcs g->matchjump[mindex] = MIN(g->matchjump[mindex], 179462232Sdcs g->mlen + suffix - mindex); 179562232Sdcs 179662232Sdcs ssuffix = pmatches[suffix]; 179762391Sdcs while (suffix < g->mlen) { 179862673Sdcs while (suffix <= ssuffix && suffix < g->mlen) { 179962232Sdcs g->matchjump[suffix] = MIN(g->matchjump[suffix], 180062232Sdcs g->mlen + ssuffix - suffix); 180162232Sdcs suffix++; 180262232Sdcs } 180386208Sdcs if (suffix < g->mlen) 180486208Sdcs ssuffix = pmatches[ssuffix]; 180562232Sdcs } 180662232Sdcs 180762232Sdcs free(pmatches); 180862232Sdcs} 180962232Sdcs 181062232Sdcs/* 18111573Srgrimes - pluscount - count + nesting 181292889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g); 18131573Srgrimes */ 18141573Srgrimesstatic sopno /* nesting depth */ 18151573Srgrimespluscount(p, g) 18161573Srgrimesstruct parse *p; 181792889Sobrienstruct re_guts *g; 18181573Srgrimes{ 181992889Sobrien sop *scan; 182092889Sobrien sop s; 182192889Sobrien sopno plusnest = 0; 182292889Sobrien sopno maxnest = 0; 18231573Srgrimes 18241573Srgrimes if (p->error != 0) 18251573Srgrimes return(0); /* there may not be an OEND */ 18261573Srgrimes 18271573Srgrimes scan = g->strip + 1; 18281573Srgrimes do { 18291573Srgrimes s = *scan++; 18301573Srgrimes switch (OP(s)) { 18311573Srgrimes case OPLUS_: 18321573Srgrimes plusnest++; 18331573Srgrimes break; 18341573Srgrimes case O_PLUS: 18351573Srgrimes if (plusnest > maxnest) 18361573Srgrimes maxnest = plusnest; 18371573Srgrimes plusnest--; 18381573Srgrimes break; 18391573Srgrimes } 18401573Srgrimes } while (OP(s) != OEND); 18411573Srgrimes if (plusnest != 0) 18421573Srgrimes g->iflags |= BAD; 18431573Srgrimes return(maxnest); 18441573Srgrimes} 1845