regex2.h revision 165903
177218Sphk/*- 277218Sphk * Copyright (c) 1992, 1993, 1994 Henry Spencer. 377218Sphk * Copyright (c) 1992, 1993, 1994 477218Sphk * The Regents of the University of California. All rights reserved. 577218Sphk * 677218Sphk * This code is derived from software contributed to Berkeley by 777218Sphk * Henry Spencer. 877218Sphk * 977218Sphk * Redistribution and use in source and binary forms, with or without 1077218Sphk * modification, are permitted provided that the following conditions 1177218Sphk * are met: 1291454Sbrooks * 1. Redistributions of source code must retain the above copyright 1391454Sbrooks * notice, this list of conditions and the following disclaimer. 1477218Sphk * 2. Redistributions in binary form must reproduce the above copyright 1577218Sphk * notice, this list of conditions and the following disclaimer in the 1677218Sphk * documentation and/or other materials provided with the distribution. 1777218Sphk * 4. Neither the name of the University nor the names of its contributors 1877218Sphk * may be used to endorse or promote products derived from this software 1977218Sphk * without specific prior written permission. 2077218Sphk * 2177218Sphk * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 2277218Sphk * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2377218Sphk * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2477218Sphk * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 2577218Sphk * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2677218Sphk * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2777218Sphk * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2877218Sphk * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2977218Sphk * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3077218Sphk * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3177218Sphk * SUCH DAMAGE. 3277218Sphk * 3377218Sphk * @(#)regex2.h 8.4 (Berkeley) 3/20/94 3477218Sphk * $FreeBSD: head/lib/libc/regex/regex2.h 165903 2007-01-09 00:28:16Z imp $ 3577218Sphk */ 3677218Sphk 3777218Sphk/* 3877218Sphk * First, the stuff that ends up in the outside-world include file 3977218Sphk = typedef off_t regoff_t; 4077218Sphk = typedef struct { 4177218Sphk = int re_magic; 4277218Sphk = size_t re_nsub; // number of parenthesized subexpressions 4377218Sphk = const char *re_endp; // end pointer for REG_PEND 4477218Sphk = struct re_guts *re_g; // none of your business :-) 4577218Sphk = } regex_t; 4677218Sphk = typedef struct { 4777218Sphk = regoff_t rm_so; // start of match 4877218Sphk = regoff_t rm_eo; // end of match 4977218Sphk = } regmatch_t; 5077218Sphk */ 5177218Sphk/* 5277218Sphk * internals of regex_t 5377218Sphk */ 5477218Sphk#define MAGIC1 ((('r'^0200)<<8) | 'e') 5577218Sphk 5677218Sphk/* 5777218Sphk * The internal representation is a *strip*, a sequence of 5877218Sphk * operators ending with an endmarker. (Some terminology etc. is a 5977218Sphk * historical relic of earlier versions which used multiple strips.) 6077218Sphk * Certain oddities in the representation are there to permit running 6177218Sphk * the machinery backwards; in particular, any deviation from sequential 6277218Sphk * flow must be marked at both its source and its destination. Some 6377218Sphk * fine points: 6477218Sphk * 6577218Sphk * - OPLUS_ and O_PLUS are *inside* the loop they create. 6677218Sphk * - OQUEST_ and O_QUEST are *outside* the bypass they create. 6777218Sphk * - OCH_ and O_CH are *outside* the multi-way branch they create, while 6877218Sphk * OOR1 and OOR2 are respectively the end and the beginning of one of 6977218Sphk * the branches. Note that there is an implicit OOR2 following OCH_ 7077218Sphk * and an implicit OOR1 preceding O_CH. 7177218Sphk * 7277218Sphk * In state representations, an operator's bit is on to signify a state 7377218Sphk * immediately *preceding* "execution" of that operator. 7477218Sphk */ 7577218Sphktypedef unsigned long sop; /* strip operator */ 7677218Sphktypedef long sopno; 77138593Ssam#define OPRMASK 0xf8000000L 7877218Sphk#define OPDMASK 0x07ffffffL 79138593Ssam#define OPSHIFT ((unsigned)27) 80116957Ssam#define OP(n) ((n)&OPRMASK) 8177218Sphk#define OPND(n) ((n)&OPDMASK) 82187801Ssam#define SOP(op, opnd) ((op)|(opnd)) 8377218Sphk/* operators meaning operand */ 8477218Sphk/* (back, fwd are offsets) */ 8577218Sphk#define OEND (1L<<OPSHIFT) /* endmarker - */ 8677218Sphk#define OCHAR (2L<<OPSHIFT) /* character wide character */ 87146873Sjhb#define OBOL (3L<<OPSHIFT) /* left anchor - */ 8877218Sphk#define OEOL (4L<<OPSHIFT) /* right anchor - */ 8977218Sphk#define OANY (5L<<OPSHIFT) /* . - */ 9077218Sphk#define OANYOF (6L<<OPSHIFT) /* [...] set number */ 9177218Sphk#define OBACK_ (7L<<OPSHIFT) /* begin \d paren number */ 92155931Ssam#define O_BACK (8L<<OPSHIFT) /* end \d paren number */ 93173275Ssam#define OPLUS_ (9L<<OPSHIFT) /* + prefix fwd to suffix */ 9477218Sphk#define O_PLUS (10L<<OPSHIFT) /* + suffix back to prefix */ 9577218Sphk#define OQUEST_ (11L<<OPSHIFT) /* ? prefix fwd to suffix */ 96178354Ssam#define O_QUEST (12L<<OPSHIFT) /* ? suffix back to prefix */ 9777218Sphk#define OLPAREN (13L<<OPSHIFT) /* ( fwd to ) */ 98178354Ssam#define ORPAREN (14L<<OPSHIFT) /* ) back to ( */ 99178354Ssam#define OCH_ (15L<<OPSHIFT) /* begin choice fwd to OOR2 */ 100178354Ssam#define OOR1 (16L<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */ 101178354Ssam#define OOR2 (17L<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */ 102178354Ssam#define O_CH (18L<<OPSHIFT) /* end choice back to OOR1 */ 103178354Ssam#define OBOW (19L<<OPSHIFT) /* begin word - */ 104178354Ssam#define OEOW (20L<<OPSHIFT) /* end word - */ 105178354Ssam 106178354Ssam/* 107178354Ssam * Structures for [] character-set representation. 108178354Ssam */ 109178354Ssamtypedef struct { 110178354Ssam wint_t min; 111178354Ssam wint_t max; 112178354Ssam} crange; 113178354Ssamtypedef struct { 114178354Ssam unsigned char bmp[NC / 8]; 115178354Ssam wctype_t *types; 116183261Ssam int ntypes; 117183261Ssam wint_t *wides; 118183261Ssam int nwides; 119183261Ssam crange *ranges; 120183261Ssam int nranges; 121178354Ssam int invert; 122178354Ssam int icase; 123187801Ssam} cset; 124187801Ssam 125173275Ssamstatic int 126173275SsamCHIN1(cset *cs, wint_t ch) 127173275Ssam{ 128173275Ssam int i; 129173275Ssam 130173275Ssam assert(ch >= 0); 131173275Ssam if (ch < NC) 132173275Ssam return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ 133178354Ssam cs->invert); 134178354Ssam for (i = 0; i < cs->nwides; i++) 135178354Ssam if (ch == cs->wides[i]) 136173275Ssam return (!cs->invert); 137173275Ssam for (i = 0; i < cs->nranges; i++) 138178354Ssam if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max) 139173275Ssam return (!cs->invert); 140173275Ssam for (i = 0; i < cs->ntypes; i++) 141173275Ssam if (iswctype(ch, cs->types[i])) 14277218Sphk return (!cs->invert); 14377218Sphk return (cs->invert); 14477218Sphk} 145178354Ssam 146178354Ssamstatic __inline int 147178354SsamCHIN(cset *cs, wint_t ch) 148178354Ssam{ 149178354Ssam 15077218Sphk assert(ch >= 0); 151187801Ssam if (ch < NC) 152178354Ssam return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ 153178354Ssam cs->invert); 154178354Ssam else if (cs->icase) 155178354Ssam return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) || 156178354Ssam CHIN1(cs, towupper(ch))); 157178354Ssam else 158173275Ssam return (CHIN1(cs, ch)); 159173275Ssam} 160178354Ssam 161173275Ssam/* 162173275Ssam * main compiled-expression structure 163170531Ssam */ 164173275Ssamstruct re_guts { 165173275Ssam int magic; 166173275Ssam# define MAGIC2 ((('R'^0200)<<8)|'E') 167173275Ssam sop *strip; /* malloced area for strip */ 168173275Ssam int ncsets; /* number of csets in use */ 169173275Ssam cset *sets; /* -> cset [ncsets] */ 170173275Ssam int cflags; /* copy of regcomp() cflags argument */ 171173275Ssam sopno nstates; /* = number of sops */ 172173275Ssam sopno firststate; /* the initial OEND (normally 0) */ 173173275Ssam sopno laststate; /* the final OEND */ 174170531Ssam int iflags; /* internal flags */ 175170531Ssam# define USEBOL 01 /* used ^ */ 176170531Ssam# define USEEOL 02 /* used $ */ 177170531Ssam# define BAD 04 /* something wrong */ 178170531Ssam int nbol; /* number of ^ used */ 179170531Ssam int neol; /* number of $ used */ 180170531Ssam char *must; /* match must contain this string */ 181187801Ssam int moffset; /* latest point at which must may be located */ 182170531Ssam int *charjump; /* Boyer-Moore char jump table */ 183187801Ssam int *matchjump; /* Boyer-Moore match jump table */ 184187801Ssam int mlen; /* length of must */ 185187801Ssam size_t nsub; /* copy of re_nsub */ 186187801Ssam int backrefs; /* does it use back references? */ 187187801Ssam sopno nplus; /* how deep does it nest +s? */ 188187801Ssam}; 189170531Ssam 190173275Ssam/* misc utilities */ 191170531Ssam#define OUT (CHAR_MIN - 1) /* a non-character value */ 192170531Ssam#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_') 193178354Ssam