1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1985-2012 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                 Eclipse Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*          http://www.eclipse.org/org/documents/epl-v10.html           *
11*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                   Phong Vo <kpv@research.att.com>                    *
20*                                                                      *
21***********************************************************************/
22#pragma prototyped
23
24/*
25 * posix regex implementation
26 *
27 * based on Doug McIlroy's C++ implementation
28 * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest
29 * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume
30 */
31
32#ifndef _REGLIB_H
33#define _REGLIB_H
34
35#define REG_VERSION_EXEC	20020509L
36#define REG_VERSION_MAP		20030916L	/* regdisc_t.re_map	*/
37
38#define re_info		env
39
40#define alloc		_reg_alloc
41#define classfun	_reg_classfun
42#define drop		_reg_drop
43#define fatal		_reg_fatal
44#define state		_reg_state
45
46typedef struct regsubop_s
47{
48	int		op;		/* REG_SUB_LOWER,REG_SUB_UPPER	*/
49	int		off;		/* re_rhs or match[] offset	*/
50	int		len;		/* re_rhs len or len==0 match[]	*/
51} regsubop_t;
52
53#define _REG_SUB_PRIVATE_ \
54	char*		re_cur;		/* re_buf cursor		*/ \
55	char*		re_end;		/* re_buf end			*/ \
56	regsubop_t*	re_ops;		/* rhs ops			*/ \
57	char		re_rhs[1];	/* substitution rhs		*/
58
59#include <ast.h>
60#include <cdt.h>
61#include <stk.h>
62
63#include "regex.h"
64
65#include <ctype.h>
66#include <errno.h>
67
68#if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG)
69#define _AST_REGEX_DEBUG	1
70#endif
71
72#define MBSIZE(p)	((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1)
73
74#undef	RE_DUP_MAX			/* posix puts this in limits.h!	*/
75#define RE_DUP_MAX	(INT_MAX/2-1)	/* 2*RE_DUP_MAX won't overflow	*/
76#define RE_DUP_INF	(RE_DUP_MAX+1)	/* infinity, for *		*/
77#define BACK_REF_MAX	9
78
79#define REG_COMP	(~REG_EXEC)
80#define REG_EXEC	(REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND)
81
82#define REX_NULL		0	/* null string (internal)	*/
83#define REX_ALT			1	/* a|b				*/
84#define REX_ALT_CATCH		2	/* REX_ALT catcher		*/
85#define REX_BACK		3	/* \1, \2, etc			*/
86#define REX_BEG			4	/* initial ^			*/
87#define REX_BEG_STR		5	/* initial ^ w/ no newline	*/
88#define REX_BM			6	/* Boyer-Moore			*/
89#define REX_CAT			7	/* catenation catcher		*/
90#define REX_CLASS		8	/* [...]			*/
91#define REX_COLL_CLASS		9	/* collation order [...]	*/
92#define REX_CONJ		10	/* a&b				*/
93#define REX_CONJ_LEFT		11	/* REX_CONJ left catcher	*/
94#define REX_CONJ_RIGHT		12	/* REX_CONJ right catcher	*/
95#define REX_DONE		13	/* completed match (internal)	*/
96#define REX_DOT			14	/* .				*/
97#define REX_END			15	/* final $			*/
98#define REX_END_STR		16	/* final $ before tail newline	*/
99#define REX_EXEC		17	/* call re.re_exec()		*/
100#define REX_FIN_STR		18	/* final $ w/ no newline	*/
101#define REX_GROUP		19	/* \(...\)			*/
102#define REX_GROUP_CATCH		20	/* REX_GROUP catcher		*/
103#define REX_GROUP_AHEAD		21	/* 0-width lookahead		*/
104#define REX_GROUP_AHEAD_CATCH	22	/* REX_GROUP_AHEAD catcher	*/
105#define REX_GROUP_AHEAD_NOT	23	/* inverted 0-width lookahead	*/
106#define REX_GROUP_BEHIND	24	/* 0-width lookbehind		*/
107#define REX_GROUP_BEHIND_CATCH	25	/* REX_GROUP_BEHIND catcher	*/
108#define REX_GROUP_BEHIND_NOT	26	/* inverted 0-width lookbehind	*/
109#define REX_GROUP_BEHIND_NOT_CATCH 27	/* REX_GROUP_BEHIND_NOT catcher	*/
110#define REX_GROUP_COND		28	/* conditional group		*/
111#define REX_GROUP_COND_CATCH	29	/* conditional group catcher	*/
112#define REX_GROUP_CUT		30	/* don't backtrack over this	*/
113#define REX_GROUP_CUT_CATCH	31	/* REX_GROUP_CUT catcher	*/
114#define REX_KMP			32	/* Knuth-Morris-Pratt		*/
115#define REX_NEG			33	/* negation			*/
116#define REX_NEG_CATCH		34	/* REX_NEG catcher		*/
117#define REX_NEST		35	/* nested match			*/
118#define REX_ONECHAR		36	/* a single-character literal	*/
119#define REX_REP			37	/* Kleene closure		*/
120#define REX_REP_CATCH		38	/* REX_REP catcher		*/
121#define REX_STRING		39	/* some chars			*/
122#define REX_TRIE		40	/* alternation of strings	*/
123#define REX_WBEG		41	/* \<				*/
124#define REX_WEND		42	/* \>				*/
125#define REX_WORD		43	/* word boundary		*/
126#define REX_WORD_NOT		44	/* not word boundary		*/
127
128#define T_META		((int)UCHAR_MAX+1)
129#define T_STAR		(T_META+0)
130#define T_PLUS		(T_META+1)
131#define T_QUES		(T_META+2)
132#define T_BANG		(T_META+3)
133#define T_AT		(T_META+4)
134#define T_TILDE		(T_META+5)
135#define T_PERCENT	(T_META+6)
136#define T_LEFT		(T_META+7)
137#define T_OPEN		(T_META+8)
138#define T_CLOSE		(T_OPEN+1)
139#define T_RIGHT		(T_OPEN+2)
140#define T_CFLX		(T_OPEN+3)
141#define T_DOT		(T_OPEN+4)
142#define T_DOTSTAR	(T_OPEN+5)
143#define T_END		(T_OPEN+6)
144#define T_BAD		(T_OPEN+7)
145#define T_DOLL		(T_OPEN+8)
146#define T_BRA		(T_OPEN+9)
147#define T_BAR		(T_OPEN+10)
148#define T_AND		(T_OPEN+11)
149#define T_LT		(T_OPEN+12)
150#define T_GT		(T_OPEN+13)
151#define T_SLASHPLUS	(T_OPEN+14)
152#define T_GROUP		(T_OPEN+15)
153#define T_WORD		(T_OPEN+16)
154#define T_WORD_NOT	(T_WORD+1)
155#define T_BEG_STR	(T_WORD+2)
156#define T_END_STR	(T_WORD+3)
157#define T_FIN_STR	(T_WORD+4)
158#define T_ESCAPE	(T_WORD+5)
159#define T_ALNUM		(T_WORD+6)
160#define T_ALNUM_NOT	(T_ALNUM+1)
161#define T_DIGIT		(T_ALNUM+2)
162#define T_DIGIT_NOT	(T_ALNUM+3)
163#define T_SPACE		(T_ALNUM+4)
164#define T_SPACE_NOT	(T_ALNUM+5)
165#define T_BACK		(T_ALNUM+6)
166
167#define BRE		0
168#define ERE		3
169#define ARE		6
170#define SRE		9
171#define KRE		12
172
173#define HIT		SSIZE_MAX
174
175#define bitclr(p,c)	((p)[((c)>>3)&037]&=(~(1<<((c)&07))))
176#define bitset(p,c)	((p)[((c)>>3)&037]|=(1<<((c)&07)))
177#define bittst(p,c)	((p)[((c)>>3)&037]&(1<<((c)&07)))
178
179#define setadd(p,c)	bitset((p)->bits,c)
180#define setclr(p,c)	bitclr((p)->bits,c)
181#define settst(p,c)	bittst((p)->bits,c)
182
183#if _hdr_wchar && _lib_wctype && _lib_iswctype
184
185#include <stdio.h> /* because <wchar.h> includes it and we generate it */
186#include <wchar.h>
187#if _hdr_wctype
188#include <wctype.h>
189#endif
190
191#if !defined(iswblank) && !_lib_iswblank
192#define _need_iswblank	1
193#define iswblank(x)	_reg_iswblank(x)
194extern int		_reg_iswblank(wint_t);
195#endif
196
197#if !defined(towupper) && !_lib_towupper
198#define towupper(x)	toupper(x)
199#endif
200
201#if !defined(towlower) && !_lib_towlower
202#define towlower(x)	tolower(x)
203#endif
204
205#else
206
207#undef	_lib_wctype
208
209#ifndef iswalnum
210#define iswalnum(x)	isalnum(x)
211#endif
212#ifndef iswalpha
213#define iswalpha(x)	isalpha(x)
214#endif
215#ifndef iswcntrl
216#define iswcntrl(x)	iscntrl(x)
217#endif
218#ifndef iswdigit
219#define iswdigit(x)	isdigit(x)
220#endif
221#ifndef iswgraph
222#define iswgraph(x)	isgraph(x)
223#endif
224#ifndef iswlower
225#define iswlower(x)	islower(x)
226#endif
227#ifndef iswprint
228#define iswprint(x)	isprint(x)
229#endif
230#ifndef iswpunct
231#define iswpunct(x)	ispunct(x)
232#endif
233#ifndef iswspace
234#define iswspace(x)	isspace(x)
235#endif
236#ifndef iswupper
237#define iswupper(x)	isupper(x)
238#endif
239#ifndef iswxdigit
240#define iswxdigit(x)	isxdigit(x)
241#endif
242
243#ifndef towlower
244#define towlower(x)	tolower(x)
245#endif
246#ifndef towupper
247#define towupper(x)	toupper(x)
248#endif
249
250#endif
251
252#ifndef	iswblank
253#define	iswblank(x)	((x)==' '||(x)=='\t')
254#endif
255
256#ifndef iswgraph
257#define	iswgraph(x)	(iswprint(x)&&!iswblank(x))
258#endif
259
260#define isword(x)	(isalnum(x)||(x)=='_')
261
262/*
263 * collation element support
264 */
265
266#define COLL_KEY_MAX	32
267
268#if COLL_KEY_MAX < MB_LEN_MAX
269#undef	COLL_KEY_MAX
270#define COLL_KEY_MAX	MB_LEN_MAX
271#endif
272
273typedef unsigned char Ckey_t[COLL_KEY_MAX+1];
274
275#define COLL_end	0
276#define COLL_call	1
277#define COLL_char	2
278#define COLL_range	3
279#define COLL_range_lc	4
280#define COLL_range_uc	5
281
282typedef struct Celt_s
283{
284	short		typ;
285	short		min;
286	short		max;
287	regclass_t	fun;
288	Ckey_t		beg;
289	Ckey_t		end;
290} Celt_t;
291
292/*
293 * private stuff hanging off regex_t
294 */
295
296typedef struct Stk_pos_s
297{
298	off_t		offset;
299	char*		base;
300} Stk_pos_t;
301
302typedef struct Vector_s
303{
304	Stk_t*		stk;		/* stack pointer		*/
305	char*		vec;		/* the data			*/
306	int		inc;		/* growth increment		*/
307	int		siz;		/* element size			*/
308	ssize_t		max;		/* max index			*/
309	ssize_t		cur;		/* current index -- user domain	*/
310} Vector_t;
311
312/*
313 * Rex_t subtypes
314 */
315
316typedef struct Cond_s
317{
318	unsigned char*	beg;		/* beginning of next match	*/
319	struct Rex_s*	next[2];	/* 0:no 1:yes next pattern	*/
320	struct Rex_s*	cont;		/* right catcher		*/
321	int		yes;		/* yes condition hit		*/
322} Cond_t;
323
324typedef struct Conj_left_s
325{
326	unsigned char*	beg;		/* beginning of left match	*/
327	struct Rex_s*	right;		/* right pattern		*/
328	struct Rex_s*	cont;		/* right catcher		*/
329} Conj_left_t;
330
331typedef struct Conj_right_s
332{
333	unsigned char*	end;		/* end of left match		*/
334	struct Rex_s*	cont;		/* ambient continuation		*/
335} Conj_right_t;
336
337typedef unsigned int Bm_mask_t;
338
339typedef struct Bm_s
340{
341	Bm_mask_t**	mask;
342	size_t*		skip;
343	size_t*		fail;
344	size_t		size;
345	ssize_t		back;
346	ssize_t		left;
347	ssize_t		right;
348	size_t		complete;
349} Bm_t;
350
351typedef struct String_s
352{
353	int*		fail;
354	unsigned char*	base;
355	size_t		size;
356} String_t;
357
358typedef struct Set_s
359{
360	unsigned char	bits[(UCHAR_MAX+1)/CHAR_BIT];
361} Set_t;
362
363typedef struct Collate_s
364{
365	int		invert;
366	Celt_t*		elements;
367} Collate_t;
368
369typedef struct Binary_s
370{
371	struct Rex_s*	left;
372	struct Rex_s*	right;
373	int		serial;
374} Binary_t;
375
376typedef struct Group_s
377{
378	int		number;		/* group number			*/
379	int		last;		/* last contained group number	*/
380	int		size;		/* lookbehind size		*/
381	int		back;		/* backreferenced		*/
382	regflags_t	flags;		/* group flags			*/
383	union
384	{
385	Binary_t	binary;
386	struct Rex_s*	rex;
387	}		expr;
388} Group_t;
389
390typedef struct Exec_s
391{
392	void*		data;
393	const char*	text;
394	size_t		size;
395} Exec_t;
396
397#define REX_NEST_open		0x01
398#define REX_NEST_close		0x02
399#define REX_NEST_escape		0x04
400#define REX_NEST_quote		0x08
401#define REX_NEST_literal	0x10
402#define REX_NEST_delimiter	0x20
403#define REX_NEST_terminator	0x40
404#define REX_NEST_separator	0x80
405
406#define REX_NEST_SHIFT		8
407
408typedef struct Nest_s
409{
410	int		primary;
411	unsigned short	none;		/* for Nest_t.type[-1] */
412	unsigned short	type[1];
413} Nest_t;
414
415/*
416 * REX_ALT catcher, solely to get control at the end of an
417 * alternative to keep records for comparing matches.
418 */
419
420typedef struct Alt_catch_s
421{
422	struct Rex_s*	cont;
423} Alt_catch_t;
424
425typedef struct Group_catch_s
426{
427	struct Rex_s*	cont;
428	regoff_t*	eo;
429} Group_catch_t;
430
431typedef struct Behind_catch_s
432{
433	struct Rex_s*	cont;
434	unsigned char*	beg;
435	unsigned char*	end;
436} Behind_catch_t;
437
438/*
439 * REX_NEG catcher determines what string lengths can be matched,
440 * then Neg investigates continuations of other lengths.
441 * This is inefficient.  For !POSITIONS expressions, we can do better:
442 * since matches to rex will be enumerated in decreasing order,
443 * we can investigate continuations whenever a length is skipped.
444 */
445
446typedef struct Neg_catch_s
447{
448	unsigned char*	beg;
449	unsigned char*	index;
450} Neg_catch_t;
451
452/*
453 * REX_REP catcher.  One is created on the stack for
454 * each iteration of a complex repetition.
455 */
456
457typedef struct Rep_catch_s
458{
459	struct Rex_s*	cont;
460	struct Rex_s*	ref;
461	unsigned char*	beg;
462	int		n;
463} Rep_catch_t;
464
465/*
466 * data structure for an alternation of pure strings
467 * son points to a subtree of all strings with a common
468 * prefix ending in character c.  sib links alternate
469 * letters in the same position of a word.  end=1 if
470 * some word ends with c.  the order of strings is
471 * irrelevant, except long words must be investigated
472 * before short ones.
473 */
474
475typedef struct Trie_node_s
476{
477	unsigned char		c;
478	unsigned char		end;
479	struct Trie_node_s*	son;
480	struct Trie_node_s*	sib;
481} Trie_node_t;
482
483typedef struct Trie_s
484{
485	Trie_node_t**	root;
486	int		min;
487	int		max;
488} Trie_t;
489
490/*
491 * Rex_t is a node in a regular expression
492 */
493
494typedef struct Rex_s
495{
496	unsigned char	type;			/* node type		*/
497	unsigned char	marked;			/* already marked	*/
498	short		serial;			/* subpattern number	*/
499	regflags_t	flags;			/* scoped flags		*/
500	int		explicit;		/* scoped explicit match*/
501	struct Rex_s*	next;			/* remaining parts	*/
502	int		lo;			/* lo dup count		*/
503	int		hi;			/* hi dup count		*/
504	unsigned char*	map;			/* fold and/or ccode map*/
505	union
506	{
507	Alt_catch_t	alt_catch;		/* alt catcher		*/
508	Bm_t		bm;			/* bm			*/
509	Behind_catch_t	behind_catch;		/* behind catcher	*/
510	Set_t*		charclass;		/* char class		*/
511	Collate_t	collate;		/* collation class	*/
512	Cond_t		cond_catch;		/* cond catcher		*/
513	Conj_left_t	conj_left;		/* conj left catcher	*/
514	Conj_right_t	conj_right;		/* conj right catcher	*/
515	void*		data;			/* data after Rex_t	*/
516	Exec_t		exec;			/* re.re_exec() args	*/
517	Group_t		group;			/* a|b or rep		*/
518	Group_catch_t	group_catch;		/* group catcher	*/
519	Neg_catch_t	neg_catch;		/* neg catcher		*/
520	Nest_t		nest;			/* nested match		*/
521	unsigned char	onechar;		/* single char		*/
522	Rep_catch_t	rep_catch;		/* rep catcher		*/
523	String_t	string;			/* string/kmp		*/
524	Trie_t		trie;			/* trie			*/
525	}		re;
526} Rex_t;
527
528typedef struct reglib_s			/* library private regex_t info	*/
529{
530	struct Rex_s*	rex;		/* compiled expression		*/
531	regdisc_t*	disc;		/* REG_DISCIPLINE discipline	*/
532	const regex_t*	regex;		/* from regexec			*/
533	unsigned char*	beg;		/* beginning of string		*/
534	unsigned char*	end;		/* end of string		*/
535	Vector_t*	pos;		/* posns of certain subpatterns	*/
536	Vector_t*	bestpos;	/* ditto for best match		*/
537	regmatch_t*	match;		/* subexrs in current match 	*/
538	regmatch_t*	best;		/* ditto in best match yet	*/
539	Stk_pos_t	stk;		/* exec stack pos		*/
540	size_t		min;		/* minimum match length		*/
541	size_t		nsub;		/* internal re_nsub		*/
542	regflags_t	flags;		/* flags from regcomp()		*/
543	int		error;		/* last error			*/
544	int		explicit;	/* explicit match on this char	*/
545	int		leading;	/* leading match on this char	*/
546	int		refs;		/* regcomp()+regdup() references*/
547	Rex_t		done;		/* the last continuation	*/
548	regstat_t	stats;		/* for regstat()		*/
549	unsigned char	fold[UCHAR_MAX+1]; /* REG_ICASE map		*/
550	unsigned char	hard;		/* hard comp			*/
551	unsigned char	once;		/* if 1st parse fails, quit	*/
552	unsigned char	separate;	/* cannot combine		*/
553	unsigned char	stack;		/* hard comp or exec		*/
554	unsigned char	sub;		/* re_sub is valid		*/
555	unsigned char	test;		/* debug/test bitmask		*/
556} Env_t;
557
558typedef struct oldregmatch_s		/* pre-20120528 regmatch_t	*/
559{
560	int		rm_so;		/* offset of start		*/
561	int		rm_eo;		/* offset of end		*/
562} oldregmatch_t;
563
564typedef struct State_s				/* shared state		*/
565{
566	regmatch_t	nomatch;
567	struct
568	{
569	unsigned char	key;
570	short		val[15];
571	}		escape[52];
572	short*		magic[UCHAR_MAX+1];
573	regdisc_t	disc;
574	int		fatal;
575	int		initialized;
576	Dt_t*		attrs;
577	Dt_t*		names;
578	Dtdisc_t	dtdisc;
579} State_t;
580
581extern State_t		state;
582
583extern void*		alloc(regdisc_t*, void*, size_t);
584extern regclass_t	classfun(int);
585extern void		drop(regdisc_t*, Rex_t*);
586extern int		fatal(regdisc_t*, int, const char*);
587
588#endif
589