regcomp.c revision 17514
11573Srgrimes/*- 21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer. 31573Srgrimes * Copyright (c) 1992, 1993, 1994 41573Srgrimes * The Regents of the University of California. All rights reserved. 51573Srgrimes * 61573Srgrimes * This code is derived from software contributed to Berkeley by 71573Srgrimes * Henry Spencer. 81573Srgrimes * 91573Srgrimes * Redistribution and use in source and binary forms, with or without 101573Srgrimes * modification, are permitted provided that the following conditions 111573Srgrimes * are met: 121573Srgrimes * 1. Redistributions of source code must retain the above copyright 131573Srgrimes * notice, this list of conditions and the following disclaimer. 141573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 151573Srgrimes * notice, this list of conditions and the following disclaimer in the 161573Srgrimes * documentation and/or other materials provided with the distribution. 171573Srgrimes * 3. All advertising materials mentioning features or use of this software 181573Srgrimes * must display the following acknowledgement: 191573Srgrimes * This product includes software developed by the University of 201573Srgrimes * California, Berkeley and its contributors. 211573Srgrimes * 4. Neither the name of the University nor the names of its contributors 221573Srgrimes * may be used to endorse or promote products derived from this software 231573Srgrimes * without specific prior written permission. 241573Srgrimes * 251573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 261573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 271573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 281573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 291573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 301573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 311573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 321573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 331573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 341573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 351573Srgrimes * SUCH DAMAGE. 361573Srgrimes * 371573Srgrimes * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 381573Srgrimes */ 391573Srgrimes 401573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 411573Srgrimesstatic char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 421573Srgrimes#endif /* LIBC_SCCS and not lint */ 431573Srgrimes 441573Srgrimes#include <sys/types.h> 451573Srgrimes#include <stdio.h> 461573Srgrimes#include <string.h> 471573Srgrimes#include <ctype.h> 481573Srgrimes#include <limits.h> 491573Srgrimes#include <stdlib.h> 501573Srgrimes#include <regex.h> 511573Srgrimes 521573Srgrimes#include "utils.h" 531573Srgrimes#include "regex2.h" 541573Srgrimes 551573Srgrimes#include "cclass.h" 561573Srgrimes#include "cname.h" 571573Srgrimes 581573Srgrimes/* 591573Srgrimes * parse structure, passed up and down to avoid global variables and 601573Srgrimes * other clumsinesses 611573Srgrimes */ 621573Srgrimesstruct parse { 631573Srgrimes char *next; /* next character in RE */ 641573Srgrimes char *end; /* end of string (-> NUL normally) */ 651573Srgrimes int error; /* has an error been seen? */ 661573Srgrimes sop *strip; /* malloced strip */ 671573Srgrimes sopno ssize; /* malloced strip size (allocated) */ 681573Srgrimes sopno slen; /* malloced strip length (used) */ 691573Srgrimes int ncsalloc; /* number of csets allocated */ 701573Srgrimes struct re_guts *g; 711573Srgrimes# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 721573Srgrimes sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 731573Srgrimes sopno pend[NPAREN]; /* -> ) ([0] unused) */ 741573Srgrimes}; 751573Srgrimes 761573Srgrimes/* ========= begin header generated by ./mkh ========= */ 771573Srgrimes#ifdef __cplusplus 781573Srgrimesextern "C" { 791573Srgrimes#endif 801573Srgrimes 811573Srgrimes/* === regcomp.c === */ 821573Srgrimesstatic void p_ere __P((struct parse *p, int stop)); 831573Srgrimesstatic void p_ere_exp __P((struct parse *p)); 841573Srgrimesstatic void p_str __P((struct parse *p)); 851573Srgrimesstatic void p_bre __P((struct parse *p, int end1, int end2)); 861573Srgrimesstatic int p_simp_re __P((struct parse *p, int starordinary)); 871573Srgrimesstatic int p_count __P((struct parse *p)); 881573Srgrimesstatic void p_bracket __P((struct parse *p)); 891573Srgrimesstatic void p_b_term __P((struct parse *p, cset *cs)); 901573Srgrimesstatic void p_b_cclass __P((struct parse *p, cset *cs)); 911573Srgrimesstatic void p_b_eclass __P((struct parse *p, cset *cs)); 921573Srgrimesstatic char p_b_symbol __P((struct parse *p)); 931573Srgrimesstatic char p_b_coll_elem __P((struct parse *p, int endc)); 941573Srgrimesstatic char othercase __P((int ch)); 951573Srgrimesstatic void bothcases __P((struct parse *p, int ch)); 961573Srgrimesstatic void ordinary __P((struct parse *p, int ch)); 971573Srgrimesstatic void nonnewline __P((struct parse *p)); 981573Srgrimesstatic void repeat __P((struct parse *p, sopno start, int from, int to)); 991573Srgrimesstatic int seterr __P((struct parse *p, int e)); 1001573Srgrimesstatic cset *allocset __P((struct parse *p)); 1011573Srgrimesstatic void freeset __P((struct parse *p, cset *cs)); 1021573Srgrimesstatic int freezeset __P((struct parse *p, cset *cs)); 1031573Srgrimesstatic int firstch __P((struct parse *p, cset *cs)); 1041573Srgrimesstatic int nch __P((struct parse *p, cset *cs)); 1051573Srgrimesstatic void mcadd __P((struct parse *p, cset *cs, char *cp)); 10617141Sjkh#if used 1071573Srgrimesstatic void mcsub __P((cset *cs, char *cp)); 1081573Srgrimesstatic int mcin __P((cset *cs, char *cp)); 1091573Srgrimesstatic char *mcfind __P((cset *cs, char *cp)); 11017141Sjkh#endif 1111573Srgrimesstatic void mcinvert __P((struct parse *p, cset *cs)); 1121573Srgrimesstatic void mccase __P((struct parse *p, cset *cs)); 1131573Srgrimesstatic int isinsets __P((struct re_guts *g, int c)); 1141573Srgrimesstatic int samesets __P((struct re_guts *g, int c1, int c2)); 1151573Srgrimesstatic void categorize __P((struct parse *p, struct re_guts *g)); 1161573Srgrimesstatic sopno dupl __P((struct parse *p, sopno start, sopno finish)); 1171573Srgrimesstatic void doemit __P((struct parse *p, sop op, size_t opnd)); 1181573Srgrimesstatic void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 1191573Srgrimesstatic void dofwd __P((struct parse *p, sopno pos, sop value)); 1201573Srgrimesstatic void enlarge __P((struct parse *p, sopno size)); 1211573Srgrimesstatic void stripsnug __P((struct parse *p, struct re_guts *g)); 1221573Srgrimesstatic void findmust __P((struct parse *p, struct re_guts *g)); 1231573Srgrimesstatic sopno pluscount __P((struct parse *p, struct re_guts *g)); 12417514Sachestatic int collcmp __P((int c1, int c2)); 1251573Srgrimes 1261573Srgrimes#ifdef __cplusplus 1271573Srgrimes} 1281573Srgrimes#endif 1291573Srgrimes/* ========= end header generated by ./mkh ========= */ 1301573Srgrimes 1311573Srgrimesstatic char nuls[10]; /* place to point scanner in event of error */ 1321573Srgrimes 1331573Srgrimes/* 1341573Srgrimes * macros for use with parse structure 1351573Srgrimes * BEWARE: these know that the parse structure is named `p' !!! 1361573Srgrimes */ 1371573Srgrimes#define PEEK() (*p->next) 1381573Srgrimes#define PEEK2() (*(p->next+1)) 1391573Srgrimes#define MORE() (p->next < p->end) 1401573Srgrimes#define MORE2() (p->next+1 < p->end) 1411573Srgrimes#define SEE(c) (MORE() && PEEK() == (c)) 1421573Srgrimes#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 1431573Srgrimes#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 1441573Srgrimes#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 1451573Srgrimes#define NEXT() (p->next++) 1461573Srgrimes#define NEXT2() (p->next += 2) 1471573Srgrimes#define NEXTn(n) (p->next += (n)) 1481573Srgrimes#define GETNEXT() (*p->next++) 1491573Srgrimes#define SETERROR(e) seterr(p, (e)) 1501573Srgrimes#define REQUIRE(co, e) ((co) || SETERROR(e)) 1511573Srgrimes#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 1521573Srgrimes#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 1531573Srgrimes#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 1541573Srgrimes#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 1551573Srgrimes#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 1561573Srgrimes#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 1571573Srgrimes#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 1581573Srgrimes#define HERE() (p->slen) 1591573Srgrimes#define THERE() (p->slen - 1) 1601573Srgrimes#define THERETHERE() (p->slen - 2) 1611573Srgrimes#define DROP(n) (p->slen -= (n)) 1621573Srgrimes 1631573Srgrimes#ifndef NDEBUG 1641573Srgrimesstatic int never = 0; /* for use in asserts; shuts lint up */ 1651573Srgrimes#else 1661573Srgrimes#define never 0 /* some <assert.h>s have bugs too */ 1671573Srgrimes#endif 1681573Srgrimes 1691573Srgrimes/* 1701573Srgrimes - regcomp - interface for parser and compilation 1711573Srgrimes = extern int regcomp(regex_t *, const char *, int); 1721573Srgrimes = #define REG_BASIC 0000 1731573Srgrimes = #define REG_EXTENDED 0001 1741573Srgrimes = #define REG_ICASE 0002 1751573Srgrimes = #define REG_NOSUB 0004 1761573Srgrimes = #define REG_NEWLINE 0010 1771573Srgrimes = #define REG_NOSPEC 0020 1781573Srgrimes = #define REG_PEND 0040 1791573Srgrimes = #define REG_DUMP 0200 1801573Srgrimes */ 1811573Srgrimesint /* 0 success, otherwise REG_something */ 1821573Srgrimesregcomp(preg, pattern, cflags) 1831573Srgrimesregex_t *preg; 1841573Srgrimesconst char *pattern; 1851573Srgrimesint cflags; 1861573Srgrimes{ 1871573Srgrimes struct parse pa; 1881573Srgrimes register struct re_guts *g; 1891573Srgrimes register struct parse *p = &pa; 1901573Srgrimes register int i; 1911573Srgrimes register size_t len; 1921573Srgrimes#ifdef REDEBUG 1931573Srgrimes# define GOODFLAGS(f) (f) 1941573Srgrimes#else 1951573Srgrimes# define GOODFLAGS(f) ((f)&~REG_DUMP) 1961573Srgrimes#endif 1971573Srgrimes 1981573Srgrimes cflags = GOODFLAGS(cflags); 1991573Srgrimes if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 2001573Srgrimes return(REG_INVARG); 2011573Srgrimes 2021573Srgrimes if (cflags®_PEND) { 2031573Srgrimes if (preg->re_endp < pattern) 2041573Srgrimes return(REG_INVARG); 2051573Srgrimes len = preg->re_endp - pattern; 2061573Srgrimes } else 2071573Srgrimes len = strlen((char *)pattern); 2081573Srgrimes 2091573Srgrimes /* do the mallocs early so failure handling is easy */ 2101573Srgrimes g = (struct re_guts *)malloc(sizeof(struct re_guts) + 2111573Srgrimes (NC-1)*sizeof(cat_t)); 2121573Srgrimes if (g == NULL) 2131573Srgrimes return(REG_ESPACE); 2141573Srgrimes p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 2151573Srgrimes p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 2161573Srgrimes p->slen = 0; 2171573Srgrimes if (p->strip == NULL) { 2181573Srgrimes free((char *)g); 2191573Srgrimes return(REG_ESPACE); 2201573Srgrimes } 2211573Srgrimes 2221573Srgrimes /* set things up */ 2231573Srgrimes p->g = g; 2241573Srgrimes p->next = (char *)pattern; /* convenience; we do not modify it */ 2251573Srgrimes p->end = p->next + len; 2261573Srgrimes p->error = 0; 2271573Srgrimes p->ncsalloc = 0; 2281573Srgrimes for (i = 0; i < NPAREN; i++) { 2291573Srgrimes p->pbegin[i] = 0; 2301573Srgrimes p->pend[i] = 0; 2311573Srgrimes } 2321573Srgrimes g->csetsize = NC; 2331573Srgrimes g->sets = NULL; 2341573Srgrimes g->setbits = NULL; 2351573Srgrimes g->ncsets = 0; 2361573Srgrimes g->cflags = cflags; 2371573Srgrimes g->iflags = 0; 2381573Srgrimes g->nbol = 0; 2391573Srgrimes g->neol = 0; 2401573Srgrimes g->must = NULL; 2411573Srgrimes g->mlen = 0; 2421573Srgrimes g->nsub = 0; 2431573Srgrimes g->ncategories = 1; /* category 0 is "everything else" */ 2441573Srgrimes g->categories = &g->catspace[-(CHAR_MIN)]; 2451573Srgrimes (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 2461573Srgrimes g->backrefs = 0; 2471573Srgrimes 2481573Srgrimes /* do it */ 2491573Srgrimes EMIT(OEND, 0); 2501573Srgrimes g->firststate = THERE(); 2511573Srgrimes if (cflags®_EXTENDED) 2521573Srgrimes p_ere(p, OUT); 2531573Srgrimes else if (cflags®_NOSPEC) 2541573Srgrimes p_str(p); 2551573Srgrimes else 2561573Srgrimes p_bre(p, OUT, OUT); 2571573Srgrimes EMIT(OEND, 0); 2581573Srgrimes g->laststate = THERE(); 2591573Srgrimes 2601573Srgrimes /* tidy up loose ends and fill things in */ 2611573Srgrimes categorize(p, g); 2621573Srgrimes stripsnug(p, g); 2631573Srgrimes findmust(p, g); 2641573Srgrimes g->nplus = pluscount(p, g); 2651573Srgrimes g->magic = MAGIC2; 2661573Srgrimes preg->re_nsub = g->nsub; 2671573Srgrimes preg->re_g = g; 2681573Srgrimes preg->re_magic = MAGIC1; 2691573Srgrimes#ifndef REDEBUG 2701573Srgrimes /* not debugging, so can't rely on the assert() in regexec() */ 2711573Srgrimes if (g->iflags&BAD) 2721573Srgrimes SETERROR(REG_ASSERT); 2731573Srgrimes#endif 2741573Srgrimes 2751573Srgrimes /* win or lose, we're done */ 2761573Srgrimes if (p->error != 0) /* lose */ 2771573Srgrimes regfree(preg); 2781573Srgrimes return(p->error); 2791573Srgrimes} 2801573Srgrimes 2811573Srgrimes/* 2821573Srgrimes - p_ere - ERE parser top level, concatenation and alternation 2831573Srgrimes == static void p_ere(register struct parse *p, int stop); 2841573Srgrimes */ 2851573Srgrimesstatic void 2861573Srgrimesp_ere(p, stop) 2871573Srgrimesregister struct parse *p; 2881573Srgrimesint stop; /* character this ERE should end at */ 2891573Srgrimes{ 2901573Srgrimes register char c; 2911573Srgrimes register sopno prevback; 2921573Srgrimes register sopno prevfwd; 2931573Srgrimes register sopno conc; 2941573Srgrimes register int first = 1; /* is this the first alternative? */ 2951573Srgrimes 2961573Srgrimes for (;;) { 2971573Srgrimes /* do a bunch of concatenated expressions */ 2981573Srgrimes conc = HERE(); 2991573Srgrimes while (MORE() && (c = PEEK()) != '|' && c != stop) 3001573Srgrimes p_ere_exp(p); 30117141Sjkh (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 3021573Srgrimes 3031573Srgrimes if (!EAT('|')) 3041573Srgrimes break; /* NOTE BREAK OUT */ 3051573Srgrimes 3061573Srgrimes if (first) { 3071573Srgrimes INSERT(OCH_, conc); /* offset is wrong */ 3081573Srgrimes prevfwd = conc; 3091573Srgrimes prevback = conc; 3101573Srgrimes first = 0; 3111573Srgrimes } 3121573Srgrimes ASTERN(OOR1, prevback); 3131573Srgrimes prevback = THERE(); 3141573Srgrimes AHEAD(prevfwd); /* fix previous offset */ 3151573Srgrimes prevfwd = HERE(); 3161573Srgrimes EMIT(OOR2, 0); /* offset is very wrong */ 3171573Srgrimes } 3181573Srgrimes 3191573Srgrimes if (!first) { /* tail-end fixups */ 3201573Srgrimes AHEAD(prevfwd); 3211573Srgrimes ASTERN(O_CH, prevback); 3221573Srgrimes } 3231573Srgrimes 3241573Srgrimes assert(!MORE() || SEE(stop)); 3251573Srgrimes} 3261573Srgrimes 3271573Srgrimes/* 3281573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 3291573Srgrimes == static void p_ere_exp(register struct parse *p); 3301573Srgrimes */ 3311573Srgrimesstatic void 3321573Srgrimesp_ere_exp(p) 3331573Srgrimesregister struct parse *p; 3341573Srgrimes{ 3351573Srgrimes register char c; 3361573Srgrimes register sopno pos; 3371573Srgrimes register int count; 3381573Srgrimes register int count2; 3391573Srgrimes register sopno subno; 3401573Srgrimes int wascaret = 0; 3411573Srgrimes 3421573Srgrimes assert(MORE()); /* caller should have ensured this */ 3431573Srgrimes c = GETNEXT(); 3441573Srgrimes 3451573Srgrimes pos = HERE(); 3461573Srgrimes switch (c) { 3471573Srgrimes case '(': 34817141Sjkh (void)REQUIRE(MORE(), REG_EPAREN); 3491573Srgrimes p->g->nsub++; 3501573Srgrimes subno = p->g->nsub; 3511573Srgrimes if (subno < NPAREN) 3521573Srgrimes p->pbegin[subno] = HERE(); 3531573Srgrimes EMIT(OLPAREN, subno); 3541573Srgrimes if (!SEE(')')) 3551573Srgrimes p_ere(p, ')'); 3561573Srgrimes if (subno < NPAREN) { 3571573Srgrimes p->pend[subno] = HERE(); 3581573Srgrimes assert(p->pend[subno] != 0); 3591573Srgrimes } 3601573Srgrimes EMIT(ORPAREN, subno); 36117141Sjkh (void)MUSTEAT(')', REG_EPAREN); 3621573Srgrimes break; 3631573Srgrimes#ifndef POSIX_MISTAKE 3641573Srgrimes case ')': /* happens only if no current unmatched ( */ 3651573Srgrimes /* 3661573Srgrimes * You may ask, why the ifndef? Because I didn't notice 3671573Srgrimes * this until slightly too late for 1003.2, and none of the 3681573Srgrimes * other 1003.2 regular-expression reviewers noticed it at 3691573Srgrimes * all. So an unmatched ) is legal POSIX, at least until 3701573Srgrimes * we can get it fixed. 3711573Srgrimes */ 3721573Srgrimes SETERROR(REG_EPAREN); 3731573Srgrimes break; 3741573Srgrimes#endif 3751573Srgrimes case '^': 3761573Srgrimes EMIT(OBOL, 0); 3771573Srgrimes p->g->iflags |= USEBOL; 3781573Srgrimes p->g->nbol++; 3791573Srgrimes wascaret = 1; 3801573Srgrimes break; 3811573Srgrimes case '$': 3821573Srgrimes EMIT(OEOL, 0); 3831573Srgrimes p->g->iflags |= USEEOL; 3841573Srgrimes p->g->neol++; 3851573Srgrimes break; 3861573Srgrimes case '|': 3871573Srgrimes SETERROR(REG_EMPTY); 3881573Srgrimes break; 3891573Srgrimes case '*': 3901573Srgrimes case '+': 3911573Srgrimes case '?': 3921573Srgrimes SETERROR(REG_BADRPT); 3931573Srgrimes break; 3941573Srgrimes case '.': 3951573Srgrimes if (p->g->cflags®_NEWLINE) 3961573Srgrimes nonnewline(p); 3971573Srgrimes else 3981573Srgrimes EMIT(OANY, 0); 3991573Srgrimes break; 4001573Srgrimes case '[': 4011573Srgrimes p_bracket(p); 4021573Srgrimes break; 4031573Srgrimes case '\\': 40417141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 4051573Srgrimes c = GETNEXT(); 4061573Srgrimes ordinary(p, c); 4071573Srgrimes break; 4081573Srgrimes case '{': /* okay as ordinary except if digit follows */ 40917141Sjkh (void)REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 4101573Srgrimes /* FALLTHROUGH */ 4111573Srgrimes default: 4121573Srgrimes ordinary(p, c); 4131573Srgrimes break; 4141573Srgrimes } 4151573Srgrimes 4161573Srgrimes if (!MORE()) 4171573Srgrimes return; 4181573Srgrimes c = PEEK(); 4191573Srgrimes /* we call { a repetition if followed by a digit */ 4201573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 4211573Srgrimes (c == '{' && MORE2() && isdigit(PEEK2())) )) 4221573Srgrimes return; /* no repetition, we're done */ 4231573Srgrimes NEXT(); 4241573Srgrimes 42517141Sjkh (void)REQUIRE(!wascaret, REG_BADRPT); 4261573Srgrimes switch (c) { 4271573Srgrimes case '*': /* implemented as +? */ 4281573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 4291573Srgrimes INSERT(OPLUS_, pos); 4301573Srgrimes ASTERN(O_PLUS, pos); 4311573Srgrimes INSERT(OQUEST_, pos); 4321573Srgrimes ASTERN(O_QUEST, pos); 4331573Srgrimes break; 4341573Srgrimes case '+': 4351573Srgrimes INSERT(OPLUS_, pos); 4361573Srgrimes ASTERN(O_PLUS, pos); 4371573Srgrimes break; 4381573Srgrimes case '?': 4391573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 4401573Srgrimes INSERT(OCH_, pos); /* offset slightly wrong */ 4411573Srgrimes ASTERN(OOR1, pos); /* this one's right */ 4421573Srgrimes AHEAD(pos); /* fix the OCH_ */ 4431573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 4441573Srgrimes AHEAD(THERE()); /* ...so fix it */ 4451573Srgrimes ASTERN(O_CH, THERETHERE()); 4461573Srgrimes break; 4471573Srgrimes case '{': 4481573Srgrimes count = p_count(p); 4491573Srgrimes if (EAT(',')) { 4501573Srgrimes if (isdigit(PEEK())) { 4511573Srgrimes count2 = p_count(p); 45217141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 4531573Srgrimes } else /* single number with comma */ 4541573Srgrimes count2 = INFINITY; 4551573Srgrimes } else /* just a single number */ 4561573Srgrimes count2 = count; 4571573Srgrimes repeat(p, pos, count, count2); 4581573Srgrimes if (!EAT('}')) { /* error heuristics */ 4591573Srgrimes while (MORE() && PEEK() != '}') 4601573Srgrimes NEXT(); 46117141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 4621573Srgrimes SETERROR(REG_BADBR); 4631573Srgrimes } 4641573Srgrimes break; 4651573Srgrimes } 4661573Srgrimes 4671573Srgrimes if (!MORE()) 4681573Srgrimes return; 4691573Srgrimes c = PEEK(); 4701573Srgrimes if (!( c == '*' || c == '+' || c == '?' || 4711573Srgrimes (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 4721573Srgrimes return; 4731573Srgrimes SETERROR(REG_BADRPT); 4741573Srgrimes} 4751573Srgrimes 4761573Srgrimes/* 4771573Srgrimes - p_str - string (no metacharacters) "parser" 4781573Srgrimes == static void p_str(register struct parse *p); 4791573Srgrimes */ 4801573Srgrimesstatic void 4811573Srgrimesp_str(p) 4821573Srgrimesregister struct parse *p; 4831573Srgrimes{ 48417141Sjkh (void)REQUIRE(MORE(), REG_EMPTY); 4851573Srgrimes while (MORE()) 4861573Srgrimes ordinary(p, GETNEXT()); 4871573Srgrimes} 4881573Srgrimes 4891573Srgrimes/* 4901573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation 4911573Srgrimes == static void p_bre(register struct parse *p, register int end1, \ 4921573Srgrimes == register int end2); 4931573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check. 4941573Srgrimes * 4951573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first 4961573Srgrimes * taken as an ordinary character and then revised to be an anchor. The 4971573Srgrimes * only undesirable side effect is that '$' gets included as a character 4981573Srgrimes * category in such cases. This is fairly harmless; not worth fixing. 4991573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive. 5001573Srgrimes */ 5011573Srgrimesstatic void 5021573Srgrimesp_bre(p, end1, end2) 5031573Srgrimesregister struct parse *p; 5041573Srgrimesregister int end1; /* first terminating character */ 5051573Srgrimesregister int end2; /* second terminating character */ 5061573Srgrimes{ 5071573Srgrimes register sopno start = HERE(); 5081573Srgrimes register int first = 1; /* first subexpression? */ 5091573Srgrimes register int wasdollar = 0; 5101573Srgrimes 5111573Srgrimes if (EAT('^')) { 5121573Srgrimes EMIT(OBOL, 0); 5131573Srgrimes p->g->iflags |= USEBOL; 5141573Srgrimes p->g->nbol++; 5151573Srgrimes } 5161573Srgrimes while (MORE() && !SEETWO(end1, end2)) { 5171573Srgrimes wasdollar = p_simp_re(p, first); 5181573Srgrimes first = 0; 5191573Srgrimes } 5201573Srgrimes if (wasdollar) { /* oops, that was a trailing anchor */ 5211573Srgrimes DROP(1); 5221573Srgrimes EMIT(OEOL, 0); 5231573Srgrimes p->g->iflags |= USEEOL; 5241573Srgrimes p->g->neol++; 5251573Srgrimes } 5261573Srgrimes 52717141Sjkh (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 5281573Srgrimes} 5291573Srgrimes 5301573Srgrimes/* 5311573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 5321573Srgrimes == static int p_simp_re(register struct parse *p, int starordinary); 5331573Srgrimes */ 5341573Srgrimesstatic int /* was the simple RE an unbackslashed $? */ 5351573Srgrimesp_simp_re(p, starordinary) 5361573Srgrimesregister struct parse *p; 5371573Srgrimesint starordinary; /* is a leading * an ordinary character? */ 5381573Srgrimes{ 5391573Srgrimes register int c; 5401573Srgrimes register int count; 5411573Srgrimes register int count2; 5421573Srgrimes register sopno pos; 5431573Srgrimes register int i; 5441573Srgrimes register sopno subno; 5451573Srgrimes# define BACKSL (1<<CHAR_BIT) 5461573Srgrimes 5471573Srgrimes pos = HERE(); /* repetion op, if any, covers from here */ 5481573Srgrimes 5491573Srgrimes assert(MORE()); /* caller should have ensured this */ 5501573Srgrimes c = GETNEXT(); 5511573Srgrimes if (c == '\\') { 55217141Sjkh (void)REQUIRE(MORE(), REG_EESCAPE); 5531573Srgrimes c = BACKSL | (unsigned char)GETNEXT(); 5541573Srgrimes } 5551573Srgrimes switch (c) { 5561573Srgrimes case '.': 5571573Srgrimes if (p->g->cflags®_NEWLINE) 5581573Srgrimes nonnewline(p); 5591573Srgrimes else 5601573Srgrimes EMIT(OANY, 0); 5611573Srgrimes break; 5621573Srgrimes case '[': 5631573Srgrimes p_bracket(p); 5641573Srgrimes break; 5651573Srgrimes case BACKSL|'{': 5661573Srgrimes SETERROR(REG_BADRPT); 5671573Srgrimes break; 5681573Srgrimes case BACKSL|'(': 5691573Srgrimes p->g->nsub++; 5701573Srgrimes subno = p->g->nsub; 5711573Srgrimes if (subno < NPAREN) 5721573Srgrimes p->pbegin[subno] = HERE(); 5731573Srgrimes EMIT(OLPAREN, subno); 5741573Srgrimes /* the MORE here is an error heuristic */ 5751573Srgrimes if (MORE() && !SEETWO('\\', ')')) 5761573Srgrimes p_bre(p, '\\', ')'); 5771573Srgrimes if (subno < NPAREN) { 5781573Srgrimes p->pend[subno] = HERE(); 5791573Srgrimes assert(p->pend[subno] != 0); 5801573Srgrimes } 5811573Srgrimes EMIT(ORPAREN, subno); 58217141Sjkh (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 5831573Srgrimes break; 5841573Srgrimes case BACKSL|')': /* should not get here -- must be user */ 5851573Srgrimes case BACKSL|'}': 5861573Srgrimes SETERROR(REG_EPAREN); 5871573Srgrimes break; 5881573Srgrimes case BACKSL|'1': 5891573Srgrimes case BACKSL|'2': 5901573Srgrimes case BACKSL|'3': 5911573Srgrimes case BACKSL|'4': 5921573Srgrimes case BACKSL|'5': 5931573Srgrimes case BACKSL|'6': 5941573Srgrimes case BACKSL|'7': 5951573Srgrimes case BACKSL|'8': 5961573Srgrimes case BACKSL|'9': 5971573Srgrimes i = (c&~BACKSL) - '0'; 5981573Srgrimes assert(i < NPAREN); 5991573Srgrimes if (p->pend[i] != 0) { 6001573Srgrimes assert(i <= p->g->nsub); 6011573Srgrimes EMIT(OBACK_, i); 6021573Srgrimes assert(p->pbegin[i] != 0); 6031573Srgrimes assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 6041573Srgrimes assert(OP(p->strip[p->pend[i]]) == ORPAREN); 6051573Srgrimes (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 6061573Srgrimes EMIT(O_BACK, i); 6071573Srgrimes } else 6081573Srgrimes SETERROR(REG_ESUBREG); 6091573Srgrimes p->g->backrefs = 1; 6101573Srgrimes break; 6111573Srgrimes case '*': 61217141Sjkh (void)REQUIRE(starordinary, REG_BADRPT); 6131573Srgrimes /* FALLTHROUGH */ 6141573Srgrimes default: 6151573Srgrimes ordinary(p, c &~ BACKSL); 6161573Srgrimes break; 6171573Srgrimes } 6181573Srgrimes 6191573Srgrimes if (EAT('*')) { /* implemented as +? */ 6201573Srgrimes /* this case does not require the (y|) trick, noKLUDGE */ 6211573Srgrimes INSERT(OPLUS_, pos); 6221573Srgrimes ASTERN(O_PLUS, pos); 6231573Srgrimes INSERT(OQUEST_, pos); 6241573Srgrimes ASTERN(O_QUEST, pos); 6251573Srgrimes } else if (EATTWO('\\', '{')) { 6261573Srgrimes count = p_count(p); 6271573Srgrimes if (EAT(',')) { 6281573Srgrimes if (MORE() && isdigit(PEEK())) { 6291573Srgrimes count2 = p_count(p); 63017141Sjkh (void)REQUIRE(count <= count2, REG_BADBR); 6311573Srgrimes } else /* single number with comma */ 6321573Srgrimes count2 = INFINITY; 6331573Srgrimes } else /* just a single number */ 6341573Srgrimes count2 = count; 6351573Srgrimes repeat(p, pos, count, count2); 6361573Srgrimes if (!EATTWO('\\', '}')) { /* error heuristics */ 6371573Srgrimes while (MORE() && !SEETWO('\\', '}')) 6381573Srgrimes NEXT(); 63917141Sjkh (void)REQUIRE(MORE(), REG_EBRACE); 6401573Srgrimes SETERROR(REG_BADBR); 6411573Srgrimes } 6421573Srgrimes } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 6431573Srgrimes return(1); 6441573Srgrimes 6451573Srgrimes return(0); 6461573Srgrimes} 6471573Srgrimes 6481573Srgrimes/* 6491573Srgrimes - p_count - parse a repetition count 6501573Srgrimes == static int p_count(register struct parse *p); 6511573Srgrimes */ 6521573Srgrimesstatic int /* the value */ 6531573Srgrimesp_count(p) 6541573Srgrimesregister struct parse *p; 6551573Srgrimes{ 6561573Srgrimes register int count = 0; 6571573Srgrimes register int ndigits = 0; 6581573Srgrimes 6591573Srgrimes while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 6601573Srgrimes count = count*10 + (GETNEXT() - '0'); 6611573Srgrimes ndigits++; 6621573Srgrimes } 6631573Srgrimes 66417141Sjkh (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 6651573Srgrimes return(count); 6661573Srgrimes} 6671573Srgrimes 6681573Srgrimes/* 6691573Srgrimes - p_bracket - parse a bracketed character list 6701573Srgrimes == static void p_bracket(register struct parse *p); 6711573Srgrimes * 6721573Srgrimes * Note a significant property of this code: if the allocset() did SETERROR, 6731573Srgrimes * no set operations are done. 6741573Srgrimes */ 6751573Srgrimesstatic void 6761573Srgrimesp_bracket(p) 6771573Srgrimesregister struct parse *p; 6781573Srgrimes{ 6791573Srgrimes register cset *cs = allocset(p); 6801573Srgrimes register int invert = 0; 6811573Srgrimes 6821573Srgrimes /* Dept of Truly Sickening Special-Case Kludges */ 6831573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 6841573Srgrimes EMIT(OBOW, 0); 6851573Srgrimes NEXTn(6); 6861573Srgrimes return; 6871573Srgrimes } 6881573Srgrimes if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 6891573Srgrimes EMIT(OEOW, 0); 6901573Srgrimes NEXTn(6); 6911573Srgrimes return; 6921573Srgrimes } 6931573Srgrimes 6941573Srgrimes if (EAT('^')) 6951573Srgrimes invert++; /* make note to invert set at end */ 6961573Srgrimes if (EAT(']')) 6971573Srgrimes CHadd(cs, ']'); 6981573Srgrimes else if (EAT('-')) 6991573Srgrimes CHadd(cs, '-'); 7001573Srgrimes while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 7011573Srgrimes p_b_term(p, cs); 7021573Srgrimes if (EAT('-')) 7031573Srgrimes CHadd(cs, '-'); 70417141Sjkh (void)MUSTEAT(']', REG_EBRACK); 7051573Srgrimes 7061573Srgrimes if (p->error != 0) /* don't mess things up further */ 7071573Srgrimes return; 7081573Srgrimes 7091573Srgrimes if (p->g->cflags®_ICASE) { 7101573Srgrimes register int i; 7111573Srgrimes register int ci; 7121573Srgrimes 7131573Srgrimes for (i = p->g->csetsize - 1; i >= 0; i--) 7141573Srgrimes if (CHIN(cs, i) && isalpha(i)) { 7151573Srgrimes ci = othercase(i); 7161573Srgrimes if (ci != i) 7171573Srgrimes CHadd(cs, ci); 7181573Srgrimes } 7191573Srgrimes if (cs->multis != NULL) 7201573Srgrimes mccase(p, cs); 7211573Srgrimes } 7221573Srgrimes if (invert) { 7231573Srgrimes register int i; 7241573Srgrimes 7251573Srgrimes for (i = p->g->csetsize - 1; i >= 0; i--) 7261573Srgrimes if (CHIN(cs, i)) 7271573Srgrimes CHsub(cs, i); 7281573Srgrimes else 7291573Srgrimes CHadd(cs, i); 7301573Srgrimes if (p->g->cflags®_NEWLINE) 7311573Srgrimes CHsub(cs, '\n'); 7321573Srgrimes if (cs->multis != NULL) 7331573Srgrimes mcinvert(p, cs); 7341573Srgrimes } 7351573Srgrimes 7361573Srgrimes assert(cs->multis == NULL); /* xxx */ 7371573Srgrimes 7381573Srgrimes if (nch(p, cs) == 1) { /* optimize singleton sets */ 7391573Srgrimes ordinary(p, firstch(p, cs)); 7401573Srgrimes freeset(p, cs); 7411573Srgrimes } else 7421573Srgrimes EMIT(OANYOF, freezeset(p, cs)); 7431573Srgrimes} 7441573Srgrimes 74517514Sachestatic int collcmp (c1, c2) 74617514Sacheint c1, c2; 74717514Sache{ 74817514Sache static char s1[2], s2[2]; 74917514Sache 75017514Sache if (c1 == c2) 75117514Sache return (0); 75217514Sache if ( (isascii(c1) && isascii(c2)) 75317514Sache || (!isalpha(c1) && !isalpha(c2)) 75417514Sache ) 75517514Sache return (c1 - c2); 75617514Sache if (isalpha(c1) && !isalpha(c2)) { 75717514Sache if (isupper(c1)) 75817514Sache return ('A' - c2); 75917514Sache else 76017514Sache return ('a' - c2); 76117514Sache } else if (isalpha(c2) && !isalpha(c1)) { 76217514Sache if (isupper(c2)) 76317514Sache return (c1 - 'A'); 76417514Sache else 76517514Sache return (c1 - 'a'); 76617514Sache } 76717514Sache if (isupper(c1) && islower(c2)) 76817514Sache return (-1); 76917514Sache else if (islower(c1) && isupper(c2)) 77017514Sache return (1); 77117514Sache s1[0] = c1; 77217514Sache s2[0] = c2; 77317514Sache return strcoll(s1, s2); 77417514Sache} 77517514Sache 7761573Srgrimes/* 7771573Srgrimes - p_b_term - parse one term of a bracketed character list 7781573Srgrimes == static void p_b_term(register struct parse *p, register cset *cs); 7791573Srgrimes */ 7801573Srgrimesstatic void 7811573Srgrimesp_b_term(p, cs) 7821573Srgrimesregister struct parse *p; 7831573Srgrimesregister cset *cs; 7841573Srgrimes{ 7851573Srgrimes register char c; 7861573Srgrimes register char start, finish; 7871573Srgrimes register int i; 7881573Srgrimes 7891573Srgrimes /* classify what we've got */ 7901573Srgrimes switch ((MORE()) ? PEEK() : '\0') { 7911573Srgrimes case '[': 7921573Srgrimes c = (MORE2()) ? PEEK2() : '\0'; 7931573Srgrimes break; 7941573Srgrimes case '-': 7951573Srgrimes SETERROR(REG_ERANGE); 7961573Srgrimes return; /* NOTE RETURN */ 7971573Srgrimes break; 7981573Srgrimes default: 7991573Srgrimes c = '\0'; 8001573Srgrimes break; 8011573Srgrimes } 8021573Srgrimes 8031573Srgrimes switch (c) { 8041573Srgrimes case ':': /* character class */ 8051573Srgrimes NEXT2(); 80617141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 8071573Srgrimes c = PEEK(); 80817141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 8091573Srgrimes p_b_cclass(p, cs); 81017141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 81117141Sjkh (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 8121573Srgrimes break; 8131573Srgrimes case '=': /* equivalence class */ 8141573Srgrimes NEXT2(); 81517141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 8161573Srgrimes c = PEEK(); 81717141Sjkh (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 8181573Srgrimes p_b_eclass(p, cs); 81917141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 82017141Sjkh (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 8211573Srgrimes break; 8221573Srgrimes default: /* symbol, ordinary character, or range */ 8231573Srgrimes/* xxx revision needed for multichar stuff */ 8241573Srgrimes start = p_b_symbol(p); 8251573Srgrimes if (SEE('-') && MORE2() && PEEK2() != ']') { 8261573Srgrimes /* range */ 8271573Srgrimes NEXT(); 8281573Srgrimes if (EAT('-')) 8291573Srgrimes finish = '-'; 8301573Srgrimes else 8311573Srgrimes finish = p_b_symbol(p); 8321573Srgrimes } else 8331573Srgrimes finish = start; 83417514Sache if (start == finish) 83517514Sache CHadd(cs, start); 83617514Sache else { 83717514Sache uch s = (uch)start, f = (uch)finish; 83817514Sache 83917514Sache (void)REQUIRE(collcmp(s, f) <= 0, REG_ERANGE); 84017514Sache for (i = CHAR_MIN; i <= CHAR_MAX; i++) { 84117514Sache if ( collcmp(s, (uch)i) <= 0 84217514Sache && collcmp((uch)i, f) <= 0 84317514Sache ) 84417514Sache CHadd(cs, i); 84517514Sache } 84617514Sache } 8471573Srgrimes break; 8481573Srgrimes } 8491573Srgrimes} 8501573Srgrimes 8511573Srgrimes/* 8521573Srgrimes - p_b_cclass - parse a character-class name and deal with it 8531573Srgrimes == static void p_b_cclass(register struct parse *p, register cset *cs); 8541573Srgrimes */ 8551573Srgrimesstatic void 8561573Srgrimesp_b_cclass(p, cs) 8571573Srgrimesregister struct parse *p; 8581573Srgrimesregister cset *cs; 8591573Srgrimes{ 86017508Sache register int c; 8611573Srgrimes register char *sp = p->next; 8621573Srgrimes register struct cclass *cp; 8631573Srgrimes register size_t len; 8641573Srgrimes 86517508Sache while (MORE() && isalpha((uch)PEEK())) 8661573Srgrimes NEXT(); 8671573Srgrimes len = p->next - sp; 8681573Srgrimes for (cp = cclasses; cp->name != NULL; cp++) 8691573Srgrimes if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 8701573Srgrimes break; 8711573Srgrimes if (cp->name == NULL) { 8721573Srgrimes /* oops, didn't find it */ 8731573Srgrimes SETERROR(REG_ECTYPE); 8741573Srgrimes return; 8751573Srgrimes } 8761573Srgrimes 87717508Sache switch (cp->fidx) { 87817508Sache case CALNUM: 87917508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 88017508Sache if (isalnum((uch)c)) 88117508Sache CHadd(cs, c); 88217508Sache break; 88317508Sache case CALPHA: 88417508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 88517508Sache if (isalpha((uch)c)) 88617508Sache CHadd(cs, c); 88717508Sache break; 88817508Sache case CBLANK: 88917508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 89017508Sache if (isblank((uch)c)) 89117508Sache CHadd(cs, c); 89217508Sache break; 89317508Sache case CCNTRL: 89417508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 89517508Sache if (iscntrl((uch)c)) 89617508Sache CHadd(cs, c); 89717508Sache break; 89817508Sache case CDIGIT: 89917508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 90017508Sache if (isdigit((uch)c)) 90117508Sache CHadd(cs, c); 90217508Sache break; 90317508Sache case CGRAPH: 90417508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 90517508Sache if (isgraph((uch)c)) 90617508Sache CHadd(cs, c); 90717508Sache break; 90817508Sache case CLOWER: 90917508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 91017508Sache if (islower((uch)c)) 91117508Sache CHadd(cs, c); 91217508Sache break; 91317508Sache case CPRINT: 91417508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 91517508Sache if (isprint((uch)c)) 91617508Sache CHadd(cs, c); 91717508Sache break; 91817508Sache case CPUNCT: 91917508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 92017508Sache if (ispunct((uch)c)) 92117508Sache CHadd(cs, c); 92217508Sache break; 92317508Sache case CSPACE: 92417508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 92517508Sache if (isspace((uch)c)) 92617508Sache CHadd(cs, c); 92717508Sache break; 92817508Sache case CUPPER: 92917508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 93017508Sache if (isupper((uch)c)) 93117508Sache CHadd(cs, c); 93217508Sache break; 93317508Sache case CXDIGIT: 93417508Sache for (c = CHAR_MIN; c <= CHAR_MAX; c++) 93517508Sache if (isxdigit((uch)c)) 93617508Sache CHadd(cs, c); 93717508Sache break; 93817508Sache } 93917508Sache#if 0 9401573Srgrimes for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 9411573Srgrimes MCadd(p, cs, u); 94217508Sache#endif 9431573Srgrimes} 9441573Srgrimes 9451573Srgrimes/* 9461573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it 9471573Srgrimes == static void p_b_eclass(register struct parse *p, register cset *cs); 9481573Srgrimes * 9491573Srgrimes * This implementation is incomplete. xxx 9501573Srgrimes */ 9511573Srgrimesstatic void 9521573Srgrimesp_b_eclass(p, cs) 9531573Srgrimesregister struct parse *p; 9541573Srgrimesregister cset *cs; 9551573Srgrimes{ 9561573Srgrimes register char c; 9571573Srgrimes 9581573Srgrimes c = p_b_coll_elem(p, '='); 9591573Srgrimes CHadd(cs, c); 9601573Srgrimes} 9611573Srgrimes 9621573Srgrimes/* 9631573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 9641573Srgrimes == static char p_b_symbol(register struct parse *p); 9651573Srgrimes */ 9661573Srgrimesstatic char /* value of symbol */ 9671573Srgrimesp_b_symbol(p) 9681573Srgrimesregister struct parse *p; 9691573Srgrimes{ 9701573Srgrimes register char value; 9711573Srgrimes 97217141Sjkh (void)REQUIRE(MORE(), REG_EBRACK); 9731573Srgrimes if (!EATTWO('[', '.')) 9741573Srgrimes return(GETNEXT()); 9751573Srgrimes 9761573Srgrimes /* collating symbol */ 9771573Srgrimes value = p_b_coll_elem(p, '.'); 97817141Sjkh (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 9791573Srgrimes return(value); 9801573Srgrimes} 9811573Srgrimes 9821573Srgrimes/* 9831573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up 9841573Srgrimes == static char p_b_coll_elem(register struct parse *p, int endc); 9851573Srgrimes */ 9861573Srgrimesstatic char /* value of collating element */ 9871573Srgrimesp_b_coll_elem(p, endc) 9881573Srgrimesregister struct parse *p; 9891573Srgrimesint endc; /* name ended by endc,']' */ 9901573Srgrimes{ 9911573Srgrimes register char *sp = p->next; 9921573Srgrimes register struct cname *cp; 9931573Srgrimes register int len; 9941573Srgrimes 9951573Srgrimes while (MORE() && !SEETWO(endc, ']')) 9961573Srgrimes NEXT(); 9971573Srgrimes if (!MORE()) { 9981573Srgrimes SETERROR(REG_EBRACK); 9991573Srgrimes return(0); 10001573Srgrimes } 10011573Srgrimes len = p->next - sp; 10021573Srgrimes for (cp = cnames; cp->name != NULL; cp++) 10031573Srgrimes if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 10041573Srgrimes return(cp->code); /* known name */ 10051573Srgrimes if (len == 1) 10061573Srgrimes return(*sp); /* single character */ 10071573Srgrimes SETERROR(REG_ECOLLATE); /* neither */ 10081573Srgrimes return(0); 10091573Srgrimes} 10101573Srgrimes 10111573Srgrimes/* 10121573Srgrimes - othercase - return the case counterpart of an alphabetic 10131573Srgrimes == static char othercase(int ch); 10141573Srgrimes */ 10151573Srgrimesstatic char /* if no counterpart, return ch */ 10161573Srgrimesothercase(ch) 10171573Srgrimesint ch; 10181573Srgrimes{ 101914815Sache ch = (unsigned char)ch; 10201573Srgrimes assert(isalpha(ch)); 10211573Srgrimes if (isupper(ch)) 10221573Srgrimes return(tolower(ch)); 10231573Srgrimes else if (islower(ch)) 10241573Srgrimes return(toupper(ch)); 10251573Srgrimes else /* peculiar, but could happen */ 10261573Srgrimes return(ch); 10271573Srgrimes} 10281573Srgrimes 10291573Srgrimes/* 10301573Srgrimes - bothcases - emit a dualcase version of a two-case character 10311573Srgrimes == static void bothcases(register struct parse *p, int ch); 10321573Srgrimes * 10331573Srgrimes * Boy, is this implementation ever a kludge... 10341573Srgrimes */ 10351573Srgrimesstatic void 10361573Srgrimesbothcases(p, ch) 10371573Srgrimesregister struct parse *p; 10381573Srgrimesint ch; 10391573Srgrimes{ 10401573Srgrimes register char *oldnext = p->next; 10411573Srgrimes register char *oldend = p->end; 10421573Srgrimes char bracket[3]; 10431573Srgrimes 104414815Sache ch = (unsigned char)ch; 10451573Srgrimes assert(othercase(ch) != ch); /* p_bracket() would recurse */ 10461573Srgrimes p->next = bracket; 10471573Srgrimes p->end = bracket+2; 10481573Srgrimes bracket[0] = ch; 10491573Srgrimes bracket[1] = ']'; 10501573Srgrimes bracket[2] = '\0'; 10511573Srgrimes p_bracket(p); 10521573Srgrimes assert(p->next == bracket+2); 10531573Srgrimes p->next = oldnext; 10541573Srgrimes p->end = oldend; 10551573Srgrimes} 10561573Srgrimes 10571573Srgrimes/* 10581573Srgrimes - ordinary - emit an ordinary character 10591573Srgrimes == static void ordinary(register struct parse *p, register int ch); 10601573Srgrimes */ 10611573Srgrimesstatic void 10621573Srgrimesordinary(p, ch) 10631573Srgrimesregister struct parse *p; 10641573Srgrimesregister int ch; 10651573Srgrimes{ 10661573Srgrimes register cat_t *cap = p->g->categories; 10671573Srgrimes 106814815Sache if ((p->g->cflags®_ICASE) && isalpha((unsigned char)ch) && othercase(ch) != ch) 10691573Srgrimes bothcases(p, ch); 10701573Srgrimes else { 10711573Srgrimes EMIT(OCHAR, (unsigned char)ch); 10721573Srgrimes if (cap[ch] == 0) 10731573Srgrimes cap[ch] = p->g->ncategories++; 10741573Srgrimes } 10751573Srgrimes} 10761573Srgrimes 10771573Srgrimes/* 10781573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY 10791573Srgrimes == static void nonnewline(register struct parse *p); 10801573Srgrimes * 10811573Srgrimes * Boy, is this implementation ever a kludge... 10821573Srgrimes */ 10831573Srgrimesstatic void 10841573Srgrimesnonnewline(p) 10851573Srgrimesregister struct parse *p; 10861573Srgrimes{ 10871573Srgrimes register char *oldnext = p->next; 10881573Srgrimes register char *oldend = p->end; 10891573Srgrimes char bracket[4]; 10901573Srgrimes 10911573Srgrimes p->next = bracket; 10921573Srgrimes p->end = bracket+3; 10931573Srgrimes bracket[0] = '^'; 10941573Srgrimes bracket[1] = '\n'; 10951573Srgrimes bracket[2] = ']'; 10961573Srgrimes bracket[3] = '\0'; 10971573Srgrimes p_bracket(p); 10981573Srgrimes assert(p->next == bracket+3); 10991573Srgrimes p->next = oldnext; 11001573Srgrimes p->end = oldend; 11011573Srgrimes} 11021573Srgrimes 11031573Srgrimes/* 11041573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed 11051573Srgrimes == static void repeat(register struct parse *p, sopno start, int from, int to); 11061573Srgrimes */ 11071573Srgrimesstatic void 11081573Srgrimesrepeat(p, start, from, to) 11091573Srgrimesregister struct parse *p; 11101573Srgrimessopno start; /* operand from here to end of strip */ 11111573Srgrimesint from; /* repeated from this number */ 11121573Srgrimesint to; /* to this number of times (maybe INFINITY) */ 11131573Srgrimes{ 11141573Srgrimes register sopno finish = HERE(); 11151573Srgrimes# define N 2 11161573Srgrimes# define INF 3 11171573Srgrimes# define REP(f, t) ((f)*8 + (t)) 11181573Srgrimes# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 11191573Srgrimes register sopno copy; 11201573Srgrimes 11211573Srgrimes if (p->error != 0) /* head off possible runaway recursion */ 11221573Srgrimes return; 11231573Srgrimes 11241573Srgrimes assert(from <= to); 11251573Srgrimes 11261573Srgrimes switch (REP(MAP(from), MAP(to))) { 11271573Srgrimes case REP(0, 0): /* must be user doing this */ 11281573Srgrimes DROP(finish-start); /* drop the operand */ 11291573Srgrimes break; 11301573Srgrimes case REP(0, 1): /* as x{1,1}? */ 11311573Srgrimes case REP(0, N): /* as x{1,n}? */ 11321573Srgrimes case REP(0, INF): /* as x{1,}? */ 11331573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 11341573Srgrimes INSERT(OCH_, start); /* offset is wrong... */ 11351573Srgrimes repeat(p, start+1, 1, to); 11361573Srgrimes ASTERN(OOR1, start); 11371573Srgrimes AHEAD(start); /* ... fix it */ 11381573Srgrimes EMIT(OOR2, 0); 11391573Srgrimes AHEAD(THERE()); 11401573Srgrimes ASTERN(O_CH, THERETHERE()); 11411573Srgrimes break; 11421573Srgrimes case REP(1, 1): /* trivial case */ 11431573Srgrimes /* done */ 11441573Srgrimes break; 11451573Srgrimes case REP(1, N): /* as x?x{1,n-1} */ 11461573Srgrimes /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 11471573Srgrimes INSERT(OCH_, start); 11481573Srgrimes ASTERN(OOR1, start); 11491573Srgrimes AHEAD(start); 11501573Srgrimes EMIT(OOR2, 0); /* offset very wrong... */ 11511573Srgrimes AHEAD(THERE()); /* ...so fix it */ 11521573Srgrimes ASTERN(O_CH, THERETHERE()); 11531573Srgrimes copy = dupl(p, start+1, finish+1); 11541573Srgrimes assert(copy == finish+4); 11551573Srgrimes repeat(p, copy, 1, to-1); 11561573Srgrimes break; 11571573Srgrimes case REP(1, INF): /* as x+ */ 11581573Srgrimes INSERT(OPLUS_, start); 11591573Srgrimes ASTERN(O_PLUS, start); 11601573Srgrimes break; 11611573Srgrimes case REP(N, N): /* as xx{m-1,n-1} */ 11621573Srgrimes copy = dupl(p, start, finish); 11631573Srgrimes repeat(p, copy, from-1, to-1); 11641573Srgrimes break; 11651573Srgrimes case REP(N, INF): /* as xx{n-1,INF} */ 11661573Srgrimes copy = dupl(p, start, finish); 11671573Srgrimes repeat(p, copy, from-1, to); 11681573Srgrimes break; 11691573Srgrimes default: /* "can't happen" */ 11701573Srgrimes SETERROR(REG_ASSERT); /* just in case */ 11711573Srgrimes break; 11721573Srgrimes } 11731573Srgrimes} 11741573Srgrimes 11751573Srgrimes/* 11761573Srgrimes - seterr - set an error condition 11771573Srgrimes == static int seterr(register struct parse *p, int e); 11781573Srgrimes */ 11791573Srgrimesstatic int /* useless but makes type checking happy */ 11801573Srgrimesseterr(p, e) 11811573Srgrimesregister struct parse *p; 11821573Srgrimesint e; 11831573Srgrimes{ 11841573Srgrimes if (p->error == 0) /* keep earliest error condition */ 11851573Srgrimes p->error = e; 11861573Srgrimes p->next = nuls; /* try to bring things to a halt */ 11871573Srgrimes p->end = nuls; 11881573Srgrimes return(0); /* make the return value well-defined */ 11891573Srgrimes} 11901573Srgrimes 11911573Srgrimes/* 11921573Srgrimes - allocset - allocate a set of characters for [] 11931573Srgrimes == static cset *allocset(register struct parse *p); 11941573Srgrimes */ 11951573Srgrimesstatic cset * 11961573Srgrimesallocset(p) 11971573Srgrimesregister struct parse *p; 11981573Srgrimes{ 11991573Srgrimes register int no = p->g->ncsets++; 12001573Srgrimes register size_t nc; 12011573Srgrimes register size_t nbytes; 12021573Srgrimes register cset *cs; 12031573Srgrimes register size_t css = (size_t)p->g->csetsize; 12041573Srgrimes register int i; 12051573Srgrimes 12061573Srgrimes if (no >= p->ncsalloc) { /* need another column of space */ 12071573Srgrimes p->ncsalloc += CHAR_BIT; 12081573Srgrimes nc = p->ncsalloc; 12091573Srgrimes assert(nc % CHAR_BIT == 0); 12101573Srgrimes nbytes = nc / CHAR_BIT * css; 12111573Srgrimes if (p->g->sets == NULL) 12121573Srgrimes p->g->sets = (cset *)malloc(nc * sizeof(cset)); 12131573Srgrimes else 12141573Srgrimes p->g->sets = (cset *)realloc((char *)p->g->sets, 12151573Srgrimes nc * sizeof(cset)); 12161573Srgrimes if (p->g->setbits == NULL) 12171573Srgrimes p->g->setbits = (uch *)malloc(nbytes); 12181573Srgrimes else { 12191573Srgrimes p->g->setbits = (uch *)realloc((char *)p->g->setbits, 12201573Srgrimes nbytes); 12211573Srgrimes /* xxx this isn't right if setbits is now NULL */ 12221573Srgrimes for (i = 0; i < no; i++) 12231573Srgrimes p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 12241573Srgrimes } 12251573Srgrimes if (p->g->sets != NULL && p->g->setbits != NULL) 12261573Srgrimes (void) memset((char *)p->g->setbits + (nbytes - css), 12271573Srgrimes 0, css); 12281573Srgrimes else { 12291573Srgrimes no = 0; 12301573Srgrimes SETERROR(REG_ESPACE); 12311573Srgrimes /* caller's responsibility not to do set ops */ 12321573Srgrimes } 12331573Srgrimes } 12341573Srgrimes 12351573Srgrimes assert(p->g->sets != NULL); /* xxx */ 12361573Srgrimes cs = &p->g->sets[no]; 12371573Srgrimes cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 12381573Srgrimes cs->mask = 1 << ((no) % CHAR_BIT); 12391573Srgrimes cs->hash = 0; 12401573Srgrimes cs->smultis = 0; 12411573Srgrimes cs->multis = NULL; 12421573Srgrimes 12431573Srgrimes return(cs); 12441573Srgrimes} 12451573Srgrimes 12461573Srgrimes/* 12471573Srgrimes - freeset - free a now-unused set 12481573Srgrimes == static void freeset(register struct parse *p, register cset *cs); 12491573Srgrimes */ 12501573Srgrimesstatic void 12511573Srgrimesfreeset(p, cs) 12521573Srgrimesregister struct parse *p; 12531573Srgrimesregister cset *cs; 12541573Srgrimes{ 12551573Srgrimes register int i; 12561573Srgrimes register cset *top = &p->g->sets[p->g->ncsets]; 12571573Srgrimes register size_t css = (size_t)p->g->csetsize; 12581573Srgrimes 12591573Srgrimes for (i = 0; i < css; i++) 12601573Srgrimes CHsub(cs, i); 12611573Srgrimes if (cs == top-1) /* recover only the easy case */ 12621573Srgrimes p->g->ncsets--; 12631573Srgrimes} 12641573Srgrimes 12651573Srgrimes/* 12661573Srgrimes - freezeset - final processing on a set of characters 12671573Srgrimes == static int freezeset(register struct parse *p, register cset *cs); 12681573Srgrimes * 12691573Srgrimes * The main task here is merging identical sets. This is usually a waste 12701573Srgrimes * of time (although the hash code minimizes the overhead), but can win 12711573Srgrimes * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 12721573Srgrimes * is done using addition rather than xor -- all ASCII [aA] sets xor to 12731573Srgrimes * the same value! 12741573Srgrimes */ 12751573Srgrimesstatic int /* set number */ 12761573Srgrimesfreezeset(p, cs) 12771573Srgrimesregister struct parse *p; 12781573Srgrimesregister cset *cs; 12791573Srgrimes{ 128017509Sache register short h = cs->hash; 12811573Srgrimes register int i; 12821573Srgrimes register cset *top = &p->g->sets[p->g->ncsets]; 12831573Srgrimes register cset *cs2; 12841573Srgrimes register size_t css = (size_t)p->g->csetsize; 12851573Srgrimes 12861573Srgrimes /* look for an earlier one which is the same */ 12871573Srgrimes for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 12881573Srgrimes if (cs2->hash == h && cs2 != cs) { 12891573Srgrimes /* maybe */ 12901573Srgrimes for (i = 0; i < css; i++) 12911573Srgrimes if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 12921573Srgrimes break; /* no */ 12931573Srgrimes if (i == css) 12941573Srgrimes break; /* yes */ 12951573Srgrimes } 12961573Srgrimes 12971573Srgrimes if (cs2 < top) { /* found one */ 12981573Srgrimes freeset(p, cs); 12991573Srgrimes cs = cs2; 13001573Srgrimes } 13011573Srgrimes 13021573Srgrimes return((int)(cs - p->g->sets)); 13031573Srgrimes} 13041573Srgrimes 13051573Srgrimes/* 13061573Srgrimes - firstch - return first character in a set (which must have at least one) 13071573Srgrimes == static int firstch(register struct parse *p, register cset *cs); 13081573Srgrimes */ 13091573Srgrimesstatic int /* character; there is no "none" value */ 13101573Srgrimesfirstch(p, cs) 13111573Srgrimesregister struct parse *p; 13121573Srgrimesregister cset *cs; 13131573Srgrimes{ 13141573Srgrimes register int i; 13151573Srgrimes register size_t css = (size_t)p->g->csetsize; 13161573Srgrimes 13171573Srgrimes for (i = 0; i < css; i++) 13181573Srgrimes if (CHIN(cs, i)) 131914815Sache return((unsigned char)i); 13201573Srgrimes assert(never); 13211573Srgrimes return(0); /* arbitrary */ 13221573Srgrimes} 13231573Srgrimes 13241573Srgrimes/* 13251573Srgrimes - nch - number of characters in a set 13261573Srgrimes == static int nch(register struct parse *p, register cset *cs); 13271573Srgrimes */ 13281573Srgrimesstatic int 13291573Srgrimesnch(p, cs) 13301573Srgrimesregister struct parse *p; 13311573Srgrimesregister cset *cs; 13321573Srgrimes{ 13331573Srgrimes register int i; 13341573Srgrimes register size_t css = (size_t)p->g->csetsize; 13351573Srgrimes register int n = 0; 13361573Srgrimes 13371573Srgrimes for (i = 0; i < css; i++) 13381573Srgrimes if (CHIN(cs, i)) 13391573Srgrimes n++; 13401573Srgrimes return(n); 13411573Srgrimes} 13421573Srgrimes 13431573Srgrimes/* 13441573Srgrimes - mcadd - add a collating element to a cset 13451573Srgrimes == static void mcadd(register struct parse *p, register cset *cs, \ 13461573Srgrimes == register char *cp); 13471573Srgrimes */ 13481573Srgrimesstatic void 13491573Srgrimesmcadd(p, cs, cp) 13501573Srgrimesregister struct parse *p; 13511573Srgrimesregister cset *cs; 13521573Srgrimesregister char *cp; 13531573Srgrimes{ 13541573Srgrimes register size_t oldend = cs->smultis; 13551573Srgrimes 13561573Srgrimes cs->smultis += strlen(cp) + 1; 13571573Srgrimes if (cs->multis == NULL) 13581573Srgrimes cs->multis = malloc(cs->smultis); 13591573Srgrimes else 13601573Srgrimes cs->multis = realloc(cs->multis, cs->smultis); 13611573Srgrimes if (cs->multis == NULL) { 13621573Srgrimes SETERROR(REG_ESPACE); 13631573Srgrimes return; 13641573Srgrimes } 13651573Srgrimes 13661573Srgrimes (void) strcpy(cs->multis + oldend - 1, cp); 13671573Srgrimes cs->multis[cs->smultis - 1] = '\0'; 13681573Srgrimes} 13691573Srgrimes 137017141Sjkh#if used 13711573Srgrimes/* 13721573Srgrimes - mcsub - subtract a collating element from a cset 13731573Srgrimes == static void mcsub(register cset *cs, register char *cp); 13741573Srgrimes */ 13751573Srgrimesstatic void 13761573Srgrimesmcsub(cs, cp) 13771573Srgrimesregister cset *cs; 13781573Srgrimesregister char *cp; 13791573Srgrimes{ 13801573Srgrimes register char *fp = mcfind(cs, cp); 13811573Srgrimes register size_t len = strlen(fp); 13821573Srgrimes 13831573Srgrimes assert(fp != NULL); 13841573Srgrimes (void) memmove(fp, fp + len + 1, 13851573Srgrimes cs->smultis - (fp + len + 1 - cs->multis)); 13861573Srgrimes cs->smultis -= len; 13871573Srgrimes 13881573Srgrimes if (cs->smultis == 0) { 13891573Srgrimes free(cs->multis); 13901573Srgrimes cs->multis = NULL; 13911573Srgrimes return; 13921573Srgrimes } 13931573Srgrimes 13941573Srgrimes cs->multis = realloc(cs->multis, cs->smultis); 13951573Srgrimes assert(cs->multis != NULL); 13961573Srgrimes} 13971573Srgrimes 13981573Srgrimes/* 13991573Srgrimes - mcin - is a collating element in a cset? 14001573Srgrimes == static int mcin(register cset *cs, register char *cp); 14011573Srgrimes */ 14021573Srgrimesstatic int 14031573Srgrimesmcin(cs, cp) 14041573Srgrimesregister cset *cs; 14051573Srgrimesregister char *cp; 14061573Srgrimes{ 14071573Srgrimes return(mcfind(cs, cp) != NULL); 14081573Srgrimes} 14091573Srgrimes 14101573Srgrimes/* 14111573Srgrimes - mcfind - find a collating element in a cset 14121573Srgrimes == static char *mcfind(register cset *cs, register char *cp); 14131573Srgrimes */ 14141573Srgrimesstatic char * 14151573Srgrimesmcfind(cs, cp) 14161573Srgrimesregister cset *cs; 14171573Srgrimesregister char *cp; 14181573Srgrimes{ 14191573Srgrimes register char *p; 14201573Srgrimes 14211573Srgrimes if (cs->multis == NULL) 14221573Srgrimes return(NULL); 14231573Srgrimes for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 14241573Srgrimes if (strcmp(cp, p) == 0) 14251573Srgrimes return(p); 14261573Srgrimes return(NULL); 14271573Srgrimes} 142817141Sjkh#endif 14291573Srgrimes 14301573Srgrimes/* 14311573Srgrimes - mcinvert - invert the list of collating elements in a cset 14321573Srgrimes == static void mcinvert(register struct parse *p, register cset *cs); 14331573Srgrimes * 14341573Srgrimes * This would have to know the set of possibilities. Implementation 14351573Srgrimes * is deferred. 14361573Srgrimes */ 14371573Srgrimesstatic void 14381573Srgrimesmcinvert(p, cs) 14391573Srgrimesregister struct parse *p; 14401573Srgrimesregister cset *cs; 14411573Srgrimes{ 14421573Srgrimes assert(cs->multis == NULL); /* xxx */ 14431573Srgrimes} 14441573Srgrimes 14451573Srgrimes/* 14461573Srgrimes - mccase - add case counterparts of the list of collating elements in a cset 14471573Srgrimes == static void mccase(register struct parse *p, register cset *cs); 14481573Srgrimes * 14491573Srgrimes * This would have to know the set of possibilities. Implementation 14501573Srgrimes * is deferred. 14511573Srgrimes */ 14521573Srgrimesstatic void 14531573Srgrimesmccase(p, cs) 14541573Srgrimesregister struct parse *p; 14551573Srgrimesregister cset *cs; 14561573Srgrimes{ 14571573Srgrimes assert(cs->multis == NULL); /* xxx */ 14581573Srgrimes} 14591573Srgrimes 14601573Srgrimes/* 14611573Srgrimes - isinsets - is this character in any sets? 14621573Srgrimes == static int isinsets(register struct re_guts *g, int c); 14631573Srgrimes */ 14641573Srgrimesstatic int /* predicate */ 14651573Srgrimesisinsets(g, c) 14661573Srgrimesregister struct re_guts *g; 14671573Srgrimesint c; 14681573Srgrimes{ 14691573Srgrimes register uch *col; 14701573Srgrimes register int i; 14711573Srgrimes register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 14721573Srgrimes register unsigned uc = (unsigned char)c; 14731573Srgrimes 14741573Srgrimes for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 14751573Srgrimes if (col[uc] != 0) 14761573Srgrimes return(1); 14771573Srgrimes return(0); 14781573Srgrimes} 14791573Srgrimes 14801573Srgrimes/* 14811573Srgrimes - samesets - are these two characters in exactly the same sets? 14821573Srgrimes == static int samesets(register struct re_guts *g, int c1, int c2); 14831573Srgrimes */ 14841573Srgrimesstatic int /* predicate */ 14851573Srgrimessamesets(g, c1, c2) 14861573Srgrimesregister struct re_guts *g; 14871573Srgrimesint c1; 14881573Srgrimesint c2; 14891573Srgrimes{ 14901573Srgrimes register uch *col; 14911573Srgrimes register int i; 14921573Srgrimes register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 14931573Srgrimes register unsigned uc1 = (unsigned char)c1; 14941573Srgrimes register unsigned uc2 = (unsigned char)c2; 14951573Srgrimes 14961573Srgrimes for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 14971573Srgrimes if (col[uc1] != col[uc2]) 14981573Srgrimes return(0); 14991573Srgrimes return(1); 15001573Srgrimes} 15011573Srgrimes 15021573Srgrimes/* 15031573Srgrimes - categorize - sort out character categories 15041573Srgrimes == static void categorize(struct parse *p, register struct re_guts *g); 15051573Srgrimes */ 15061573Srgrimesstatic void 15071573Srgrimescategorize(p, g) 15081573Srgrimesstruct parse *p; 15091573Srgrimesregister struct re_guts *g; 15101573Srgrimes{ 15111573Srgrimes register cat_t *cats = g->categories; 15121573Srgrimes register int c; 15131573Srgrimes register int c2; 15141573Srgrimes register cat_t cat; 15151573Srgrimes 15161573Srgrimes /* avoid making error situations worse */ 15171573Srgrimes if (p->error != 0) 15181573Srgrimes return; 15191573Srgrimes 15201573Srgrimes for (c = CHAR_MIN; c <= CHAR_MAX; c++) 15211573Srgrimes if (cats[c] == 0 && isinsets(g, c)) { 15221573Srgrimes cat = g->ncategories++; 15231573Srgrimes cats[c] = cat; 15241573Srgrimes for (c2 = c+1; c2 <= CHAR_MAX; c2++) 15251573Srgrimes if (cats[c2] == 0 && samesets(g, c, c2)) 15261573Srgrimes cats[c2] = cat; 15271573Srgrimes } 15281573Srgrimes} 15291573Srgrimes 15301573Srgrimes/* 15311573Srgrimes - dupl - emit a duplicate of a bunch of sops 15321573Srgrimes == static sopno dupl(register struct parse *p, sopno start, sopno finish); 15331573Srgrimes */ 15341573Srgrimesstatic sopno /* start of duplicate */ 15351573Srgrimesdupl(p, start, finish) 15361573Srgrimesregister struct parse *p; 15371573Srgrimessopno start; /* from here */ 15381573Srgrimessopno finish; /* to this less one */ 15391573Srgrimes{ 15401573Srgrimes register sopno ret = HERE(); 15411573Srgrimes register sopno len = finish - start; 15421573Srgrimes 15431573Srgrimes assert(finish >= start); 15441573Srgrimes if (len == 0) 15451573Srgrimes return(ret); 15461573Srgrimes enlarge(p, p->ssize + len); /* this many unexpected additions */ 15471573Srgrimes assert(p->ssize >= p->slen + len); 15481573Srgrimes (void) memcpy((char *)(p->strip + p->slen), 15491573Srgrimes (char *)(p->strip + start), (size_t)len*sizeof(sop)); 15501573Srgrimes p->slen += len; 15511573Srgrimes return(ret); 15521573Srgrimes} 15531573Srgrimes 15541573Srgrimes/* 15551573Srgrimes - doemit - emit a strip operator 15561573Srgrimes == static void doemit(register struct parse *p, sop op, size_t opnd); 15571573Srgrimes * 15581573Srgrimes * It might seem better to implement this as a macro with a function as 15591573Srgrimes * hard-case backup, but it's just too big and messy unless there are 15601573Srgrimes * some changes to the data structures. Maybe later. 15611573Srgrimes */ 15621573Srgrimesstatic void 15631573Srgrimesdoemit(p, op, opnd) 15641573Srgrimesregister struct parse *p; 15651573Srgrimessop op; 15661573Srgrimessize_t opnd; 15671573Srgrimes{ 15681573Srgrimes /* avoid making error situations worse */ 15691573Srgrimes if (p->error != 0) 15701573Srgrimes return; 15711573Srgrimes 15721573Srgrimes /* deal with oversize operands ("can't happen", more or less) */ 15731573Srgrimes assert(opnd < 1<<OPSHIFT); 15741573Srgrimes 15751573Srgrimes /* deal with undersized strip */ 15761573Srgrimes if (p->slen >= p->ssize) 15771573Srgrimes enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 15781573Srgrimes assert(p->slen < p->ssize); 15791573Srgrimes 15801573Srgrimes /* finally, it's all reduced to the easy case */ 15811573Srgrimes p->strip[p->slen++] = SOP(op, opnd); 15821573Srgrimes} 15831573Srgrimes 15841573Srgrimes/* 15851573Srgrimes - doinsert - insert a sop into the strip 15861573Srgrimes == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 15871573Srgrimes */ 15881573Srgrimesstatic void 15891573Srgrimesdoinsert(p, op, opnd, pos) 15901573Srgrimesregister struct parse *p; 15911573Srgrimessop op; 15921573Srgrimessize_t opnd; 15931573Srgrimessopno pos; 15941573Srgrimes{ 15951573Srgrimes register sopno sn; 15961573Srgrimes register sop s; 15971573Srgrimes register int i; 15981573Srgrimes 15991573Srgrimes /* avoid making error situations worse */ 16001573Srgrimes if (p->error != 0) 16011573Srgrimes return; 16021573Srgrimes 16031573Srgrimes sn = HERE(); 16041573Srgrimes EMIT(op, opnd); /* do checks, ensure space */ 16051573Srgrimes assert(HERE() == sn+1); 16061573Srgrimes s = p->strip[sn]; 16071573Srgrimes 16081573Srgrimes /* adjust paren pointers */ 16091573Srgrimes assert(pos > 0); 16101573Srgrimes for (i = 1; i < NPAREN; i++) { 16111573Srgrimes if (p->pbegin[i] >= pos) { 16121573Srgrimes p->pbegin[i]++; 16131573Srgrimes } 16141573Srgrimes if (p->pend[i] >= pos) { 16151573Srgrimes p->pend[i]++; 16161573Srgrimes } 16171573Srgrimes } 16181573Srgrimes 16191573Srgrimes memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 16201573Srgrimes (HERE()-pos-1)*sizeof(sop)); 16211573Srgrimes p->strip[pos] = s; 16221573Srgrimes} 16231573Srgrimes 16241573Srgrimes/* 16251573Srgrimes - dofwd - complete a forward reference 16261573Srgrimes == static void dofwd(register struct parse *p, sopno pos, sop value); 16271573Srgrimes */ 16281573Srgrimesstatic void 16291573Srgrimesdofwd(p, pos, value) 16301573Srgrimesregister struct parse *p; 16311573Srgrimesregister sopno pos; 16321573Srgrimessop value; 16331573Srgrimes{ 16341573Srgrimes /* avoid making error situations worse */ 16351573Srgrimes if (p->error != 0) 16361573Srgrimes return; 16371573Srgrimes 16381573Srgrimes assert(value < 1<<OPSHIFT); 16391573Srgrimes p->strip[pos] = OP(p->strip[pos]) | value; 16401573Srgrimes} 16411573Srgrimes 16421573Srgrimes/* 16431573Srgrimes - enlarge - enlarge the strip 16441573Srgrimes == static void enlarge(register struct parse *p, sopno size); 16451573Srgrimes */ 16461573Srgrimesstatic void 16471573Srgrimesenlarge(p, size) 16481573Srgrimesregister struct parse *p; 16491573Srgrimesregister sopno size; 16501573Srgrimes{ 16511573Srgrimes register sop *sp; 16521573Srgrimes 16531573Srgrimes if (p->ssize >= size) 16541573Srgrimes return; 16551573Srgrimes 16561573Srgrimes sp = (sop *)realloc(p->strip, size*sizeof(sop)); 16571573Srgrimes if (sp == NULL) { 16581573Srgrimes SETERROR(REG_ESPACE); 16591573Srgrimes return; 16601573Srgrimes } 16611573Srgrimes p->strip = sp; 16621573Srgrimes p->ssize = size; 16631573Srgrimes} 16641573Srgrimes 16651573Srgrimes/* 16661573Srgrimes - stripsnug - compact the strip 16671573Srgrimes == static void stripsnug(register struct parse *p, register struct re_guts *g); 16681573Srgrimes */ 16691573Srgrimesstatic void 16701573Srgrimesstripsnug(p, g) 16711573Srgrimesregister struct parse *p; 16721573Srgrimesregister struct re_guts *g; 16731573Srgrimes{ 16741573Srgrimes g->nstates = p->slen; 16751573Srgrimes g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 16761573Srgrimes if (g->strip == NULL) { 16771573Srgrimes SETERROR(REG_ESPACE); 16781573Srgrimes g->strip = p->strip; 16791573Srgrimes } 16801573Srgrimes} 16811573Srgrimes 16821573Srgrimes/* 16831573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string 16841573Srgrimes == static void findmust(register struct parse *p, register struct re_guts *g); 16851573Srgrimes * 16861573Srgrimes * This algorithm could do fancy things like analyzing the operands of | 16871573Srgrimes * for common subsequences. Someday. This code is simple and finds most 16881573Srgrimes * of the interesting cases. 16891573Srgrimes * 16901573Srgrimes * Note that must and mlen got initialized during setup. 16911573Srgrimes */ 16921573Srgrimesstatic void 16931573Srgrimesfindmust(p, g) 16941573Srgrimesstruct parse *p; 16951573Srgrimesregister struct re_guts *g; 16961573Srgrimes{ 16971573Srgrimes register sop *scan; 16981573Srgrimes sop *start; 16991573Srgrimes register sop *newstart; 17001573Srgrimes register sopno newlen; 17011573Srgrimes register sop s; 17021573Srgrimes register char *cp; 17031573Srgrimes register sopno i; 17041573Srgrimes 17051573Srgrimes /* avoid making error situations worse */ 17061573Srgrimes if (p->error != 0) 17071573Srgrimes return; 17081573Srgrimes 17091573Srgrimes /* find the longest OCHAR sequence in strip */ 17101573Srgrimes newlen = 0; 17111573Srgrimes scan = g->strip + 1; 17121573Srgrimes do { 17131573Srgrimes s = *scan++; 17141573Srgrimes switch (OP(s)) { 17151573Srgrimes case OCHAR: /* sequence member */ 17161573Srgrimes if (newlen == 0) /* new sequence */ 17171573Srgrimes newstart = scan - 1; 17181573Srgrimes newlen++; 17191573Srgrimes break; 17201573Srgrimes case OPLUS_: /* things that don't break one */ 17211573Srgrimes case OLPAREN: 17221573Srgrimes case ORPAREN: 17231573Srgrimes break; 17241573Srgrimes case OQUEST_: /* things that must be skipped */ 17251573Srgrimes case OCH_: 17261573Srgrimes scan--; 17271573Srgrimes do { 17281573Srgrimes scan += OPND(s); 17291573Srgrimes s = *scan; 17301573Srgrimes /* assert() interferes w debug printouts */ 17311573Srgrimes if (OP(s) != O_QUEST && OP(s) != O_CH && 17321573Srgrimes OP(s) != OOR2) { 17331573Srgrimes g->iflags |= BAD; 17341573Srgrimes return; 17351573Srgrimes } 17361573Srgrimes } while (OP(s) != O_QUEST && OP(s) != O_CH); 17371573Srgrimes /* fallthrough */ 17381573Srgrimes default: /* things that break a sequence */ 17391573Srgrimes if (newlen > g->mlen) { /* ends one */ 17401573Srgrimes start = newstart; 17411573Srgrimes g->mlen = newlen; 17421573Srgrimes } 17431573Srgrimes newlen = 0; 17441573Srgrimes break; 17451573Srgrimes } 17461573Srgrimes } while (OP(s) != OEND); 17471573Srgrimes 17481573Srgrimes if (g->mlen == 0) /* there isn't one */ 17491573Srgrimes return; 17501573Srgrimes 17511573Srgrimes /* turn it into a character string */ 17521573Srgrimes g->must = malloc((size_t)g->mlen + 1); 17531573Srgrimes if (g->must == NULL) { /* argh; just forget it */ 17541573Srgrimes g->mlen = 0; 17551573Srgrimes return; 17561573Srgrimes } 17571573Srgrimes cp = g->must; 17581573Srgrimes scan = start; 17591573Srgrimes for (i = g->mlen; i > 0; i--) { 17601573Srgrimes while (OP(s = *scan++) != OCHAR) 17611573Srgrimes continue; 17621573Srgrimes assert(cp < g->must + g->mlen); 17631573Srgrimes *cp++ = (char)OPND(s); 17641573Srgrimes } 17651573Srgrimes assert(cp == g->must + g->mlen); 17661573Srgrimes *cp++ = '\0'; /* just on general principles */ 17671573Srgrimes} 17681573Srgrimes 17691573Srgrimes/* 17701573Srgrimes - pluscount - count + nesting 17711573Srgrimes == static sopno pluscount(register struct parse *p, register struct re_guts *g); 17721573Srgrimes */ 17731573Srgrimesstatic sopno /* nesting depth */ 17741573Srgrimespluscount(p, g) 17751573Srgrimesstruct parse *p; 17761573Srgrimesregister struct re_guts *g; 17771573Srgrimes{ 17781573Srgrimes register sop *scan; 17791573Srgrimes register sop s; 17801573Srgrimes register sopno plusnest = 0; 17811573Srgrimes register sopno maxnest = 0; 17821573Srgrimes 17831573Srgrimes if (p->error != 0) 17841573Srgrimes return(0); /* there may not be an OEND */ 17851573Srgrimes 17861573Srgrimes scan = g->strip + 1; 17871573Srgrimes do { 17881573Srgrimes s = *scan++; 17891573Srgrimes switch (OP(s)) { 17901573Srgrimes case OPLUS_: 17911573Srgrimes plusnest++; 17921573Srgrimes break; 17931573Srgrimes case O_PLUS: 17941573Srgrimes if (plusnest > maxnest) 17951573Srgrimes maxnest = plusnest; 17961573Srgrimes plusnest--; 17971573Srgrimes break; 17981573Srgrimes } 17991573Srgrimes } while (OP(s) != OEND); 18001573Srgrimes if (plusnest != 0) 18011573Srgrimes g->iflags |= BAD; 18021573Srgrimes return(maxnest); 18031573Srgrimes} 1804