1273459Strasz/*-
2332595Strasz * This code is derived from OpenBSD's libc/regex, original license follows:
3332595Strasz *
4273459Strasz * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5273459Strasz * Copyright (c) 1992, 1993, 1994
6273459Strasz *	The Regents of the University of California.  All rights reserved.
7273459Strasz *
8273459Strasz * This code is derived from software contributed to Berkeley by
9273459Strasz * Henry Spencer.
10273459Strasz *
11273459Strasz * Redistribution and use in source and binary forms, with or without
12273459Strasz * modification, are permitted provided that the following conditions
13273459Strasz * are met:
14273459Strasz * 1. Redistributions of source code must retain the above copyright
15273459Strasz *    notice, this list of conditions and the following disclaimer.
16273459Strasz * 2. Redistributions in binary form must reproduce the above copyright
17273459Strasz *    notice, this list of conditions and the following disclaimer in the
18273459Strasz *    documentation and/or other materials provided with the distribution.
19273459Strasz * 3. Neither the name of the University nor the names of its contributors
20273459Strasz *    may be used to endorse or promote products derived from this software
21273459Strasz *    without specific prior written permission.
22273459Strasz *
23273459Strasz * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24273459Strasz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25273459Strasz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26273459Strasz * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27273459Strasz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28273459Strasz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29273459Strasz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30273459Strasz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31273459Strasz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32273459Strasz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33273459Strasz * SUCH DAMAGE.
34273459Strasz *
35273459Strasz *	@(#)regex2.h	8.4 (Berkeley) 3/20/94
36273459Strasz */
37285086Strasz
38273459Strasz/*
39274328Smav * internals of regex_t
40274328Smav */
41285086Strasz#define	MAGIC1	((('r'^0200)<<8) | 'e')
42273459Strasz
43273459Strasz/*
44273459Strasz * The internal representation is a *strip*, a sequence of
45273459Strasz * operators ending with an endmarker.  (Some terminology etc. is a
46273459Strasz * historical relic of earlier versions which used multiple strips.)
47273459Strasz * Certain oddities in the representation are there to permit running
48273459Strasz * the machinery backwards; in particular, any deviation from sequential
49273459Strasz * flow must be marked at both its source and its destination.  Some
50273459Strasz * fine points:
51273459Strasz *
52285086Strasz * - OPLUS_ and O_PLUS are *inside* the loop they create.
53273459Strasz * - OQUEST_ and O_QUEST are *outside* the bypass they create.
54285086Strasz * - OCH_ and O_CH are *outside* the multi-way branch they create, while
55285086Strasz *   OOR1 and OOR2 are respectively the end and the beginning of one of
56285086Strasz *   the branches.  Note that there is an implicit OOR2 following OCH_
57285086Strasz *   and an implicit OOR1 preceding O_CH.
58285086Strasz *
59273459Strasz * In state representations, an operator's bit is on to signify a state
60273459Strasz * immediately *preceding* "execution" of that operator.
61273459Strasz */
62273459Strasztypedef unsigned long sop;	/* strip operator */
63273459Strasztypedef long sopno;
64273459Strasz#define	OPRMASK	0xf8000000LU
65273459Strasz#define	OPDMASK	0x07ffffffLU
66273459Strasz#define	OPSHIFT	((unsigned)27)
67273459Strasz#define	OP(n)	((n)&OPRMASK)
68273459Strasz#define	OPND(n)	((n)&OPDMASK)
69273459Strasz#define	SOP(op, opnd)	((op)|(opnd))
70273459Strasz/* operators			   meaning	operand			*/
71273459Strasz/*						(back, fwd are offsets)	*/
72273459Strasz#define	OEND	(1LU<<OPSHIFT)	/* endmarker	-			*/
73273459Strasz#define	OCHAR	(2LU<<OPSHIFT)	/* character	unsigned char		*/
74273459Strasz#define	OBOL	(3LU<<OPSHIFT)	/* left anchor	-			*/
75273459Strasz#define	OEOL	(4LU<<OPSHIFT)	/* right anchor	-			*/
76273459Strasz#define	OANY	(5LU<<OPSHIFT)	/* .		-			*/
77273459Strasz#define	OANYOF	(6LU<<OPSHIFT)	/* [...]	set number		*/
78273459Strasz#define	OBACK_	(7LU<<OPSHIFT)	/* begin \d	paren number		*/
79273459Strasz#define	O_BACK	(8LU<<OPSHIFT)	/* end \d	paren number		*/
80273459Strasz#define	OPLUS_	(9LU<<OPSHIFT)	/* + prefix	fwd to suffix		*/
81273459Strasz#define	O_PLUS	(10LU<<OPSHIFT)	/* + suffix	back to prefix		*/
82273459Strasz#define	OQUEST_	(11LU<<OPSHIFT)	/* ? prefix	fwd to suffix		*/
83273459Strasz#define	O_QUEST	(12LU<<OPSHIFT)	/* ? suffix	back to prefix		*/
84273459Strasz#define	OLPAREN	(13LU<<OPSHIFT)	/* (		fwd to )		*/
85273459Strasz#define	ORPAREN	(14LU<<OPSHIFT)	/* )		back to (		*/
86273459Strasz#define	OCH_	(15LU<<OPSHIFT)	/* begin choice	fwd to OOR2		*/
87273459Strasz#define	OOR1	(16LU<<OPSHIFT)	/* | pt. 1	back to OOR1 or OCH_	*/
88273459Strasz#define	OOR2	(17LU<<OPSHIFT)	/* | pt. 2	fwd to OOR2 or O_CH	*/
89273459Strasz#define	O_CH	(18LU<<OPSHIFT)	/* end choice	back to OOR1		*/
90273459Strasz#define	OBOW	(19LU<<OPSHIFT)	/* begin word	-			*/
91273459Strasz#define	OEOW	(20LU<<OPSHIFT)	/* end word	-			*/
92273459Strasz
93273459Strasz/*
94273459Strasz * Structure for [] character-set representation.  Character sets are
95273459Strasz * done as bit vectors, grouped 8 to a byte vector for compactness.
96273459Strasz * The individual set therefore has both a pointer to the byte vector
97273459Strasz * and a mask to pick out the relevant bit of each byte.  A hash code
98273459Strasz * simplifies testing whether two sets could be identical.
99273459Strasz *
100273459Strasz * This will get trickier for multicharacter collating elements.  As
101273459Strasz * preliminary hooks for dealing with such things, we also carry along
102273459Strasz * a string of multi-character elements, and decide the size of the
103273459Strasz * vectors at run time.
104273459Strasz */
105273459Strasztypedef struct {
106273459Strasz	uch *ptr;		/* -> uch [csetsize] */
107273459Strasz	uch mask;		/* bit within array */
108274328Smav	uch hash;		/* hash code */
109274328Smav	size_t smultis;
110274328Smav	char *multis;		/* -> char[smulti]  ab\0cd\0ef\0\0 */
111274328Smav} cset;
112274328Smav/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
113274328Smav#define	CHadd(cs, c)	((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
114274328Smav#define	CHsub(cs, c)	((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
115274328Smav#define	CHIN(cs, c)	((cs)->ptr[(uch)(c)] & (cs)->mask)
116274328Smav#define	MCadd(p, cs, cp)	mcadd(p, cs, cp)	/* llvm_regcomp() internal fns */
117274328Smav#define	MCsub(p, cs, cp)	mcsub(p, cs, cp)
118274328Smav#define	MCin(p, cs, cp)	mcin(p, cs, cp)
119274328Smav
120274328Smav/* stuff for character categories */
121274328Smavtypedef unsigned char cat_t;
122274328Smav
123274328Smav/*
124274328Smav * main compiled-expression structure
125274328Smav */
126274328Smavstruct re_guts {
127274328Smav	int magic;
128274328Smav#		define	MAGIC2	((('R'^0200)<<8)|'E')
129274328Smav	sop *strip;		/* malloced area for strip */
130274328Smav	int csetsize;		/* number of bits in a cset vector */
131273459Strasz	int ncsets;		/* number of csets in use */
132273459Strasz	cset *sets;		/* -> cset [ncsets] */
133273459Strasz	uch *setbits;		/* -> uch[csetsize][ncsets/CHAR_BIT] */
134273459Strasz	int cflags;		/* copy of llvm_regcomp() cflags argument */
135273459Strasz	sopno nstates;		/* = number of sops */
136273459Strasz	sopno firststate;	/* the initial OEND (normally 0) */
137273459Strasz	sopno laststate;	/* the final OEND */
138273459Strasz	int iflags;		/* internal flags */
139273459Strasz#		define	USEBOL	01	/* used ^ */
140273459Strasz#		define	USEEOL	02	/* used $ */
141273459Strasz#		define	REGEX_BAD	04	/* something wrong */
142274328Smav	int nbol;		/* number of ^ used */
143274328Smav	int neol;		/* number of $ used */
144274328Smav	int ncategories;	/* how many character categories */
145273459Strasz	cat_t *categories;	/* ->catspace[-CHAR_MIN] */
146274328Smav	char *must;		/* match must contain this string */
147274328Smav	int mlen;		/* length of must */
148273459Strasz	size_t nsub;		/* copy of re_nsub */
149273459Strasz	int backrefs;		/* does it use back references? */
150273459Strasz	sopno nplus;		/* how deep does it nest +s? */
151273459Strasz	/* catspace must be last */
152273459Strasz	cat_t catspace[1];	/* actually [NC] */
153273459Strasz};
154273459Strasz
155273459Strasz/* misc utilities */
156273459Strasz#define	OUT	(CHAR_MAX+1)	/* a non-character value */
157273459Strasz#define	ISWORD(c)	(isalnum(c&0xff) || (c) == '_')
158273459Strasz