1/*
2 * re.c - compile regular expressions.
3 */
4
5/*
6 * Copyright (C) 1991-2003 the Free Software Foundation, Inc.
7 *
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
10 *
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
24 */
25
26#include "awk.h"
27
/* Regex syntax option bits; chosen in resetup() and handed to re_set_syntax(). */
static reg_syntax_t syn;
29
30/* make_regexp --- generate compiled regular expressions */
31
32Regexp *
33make_regexp(const char *s, size_t len, int ignorecase)
34{
35	Regexp *rp;
36	const char *rerr;
37	const char *src = s;
38	char *temp;
39	const char *end = s + len;
40	register char *dest;
41	register int c, c2;
42#ifdef MBS_SUPPORT
43	/* The number of bytes in the current multbyte character.
44	   It is 0, when the current character is a singlebyte character.  */
45	size_t is_multibyte = 0;
46	mbstate_t mbs;
47
48	if (gawk_mb_cur_max > 1)
49		memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
50#endif
51
52	/* Handle escaped characters first. */
53
54	/*
55	 * Build a copy of the string (in dest) with the
56	 * escaped characters translated, and generate the regex
57	 * from that.
58	 */
59	emalloc(dest, char *, len + 2, "make_regexp");
60	temp = dest;
61
62	while (src < end) {
63#ifdef MBS_SUPPORT
64		if (gawk_mb_cur_max > 1 && !is_multibyte) {
65			/* The previous byte is a singlebyte character, or last byte
66			   of a multibyte character.  We check the next character.  */
67			is_multibyte = mbrlen(src, end - src, &mbs);
68			if ((is_multibyte == 1) || (is_multibyte == (size_t) -1)
69				|| (is_multibyte == (size_t) -2 || (is_multibyte == 0))) {
70				/* We treat it as a singlebyte character.  */
71				is_multibyte = 0;
72			}
73		}
74#endif
75
76		if (
77#ifdef MBS_SUPPORT
78		/* We skip multibyte character, since it must not be a special
79		   character.  */
80		    (gawk_mb_cur_max == 1 || ! is_multibyte) &&
81#endif
82		    (*src == '\\')) {
83			c = *++src;
84			switch (c) {
85			case 'a':
86			case 'b':
87			case 'f':
88			case 'n':
89			case 'r':
90			case 't':
91			case 'v':
92			case 'x':
93			case '0':
94			case '1':
95			case '2':
96			case '3':
97			case '4':
98			case '5':
99			case '6':
100			case '7':
101				c2 = parse_escape(&src);
102				if (c2 < 0)
103					cant_happen();
104				/*
105				 * Unix awk treats octal (and hex?) chars
106				 * literally in re's, so escape regexp
107				 * metacharacters.
108				 */
109				if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x')
110				    && strchr("()|*+?.^$\\[]", c2) != NULL)
111					*dest++ = '\\';
112				*dest++ = (char) c2;
113				break;
114			case '8':
115			case '9':	/* a\9b not valid */
116				*dest++ = c;
117				src++;
118				break;
119			case 'y':	/* normally \b */
120				/* gnu regex op */
121				if (! do_traditional) {
122					*dest++ = '\\';
123					*dest++ = 'b';
124					src++;
125					break;
126				}
127				/* else, fall through */
128			default:
129				*dest++ = '\\';
130				*dest++ = (char) c;
131				src++;
132				break;
133			} /* switch */
134		} else
135			*dest++ = *src++;	/* not '\\' */
136#ifdef MBS_SUPPORT
137		if (gawk_mb_cur_max > 1 && is_multibyte)
138			is_multibyte--;
139#endif
140	} /* while */
141
142	*dest = '\0' ;	/* Only necessary if we print dest ? */
143	emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
144	memset((char *) rp, 0, sizeof(*rp));
145	rp->pat.allocated = 0;	/* regex will allocate the buffer */
146	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
147
148	if (ignorecase)
149		rp->pat.translate = casetable;
150	else
151		rp->pat.translate = NULL;
152	len = dest - temp;
153	if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
154		fatal("%s: /%s/", rerr, temp);	/* rerr already gettextized inside regex routines */
155
156	/* gack. this must be done *after* re_compile_pattern */
157	rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
158
159	free(temp);
160	return rp;
161}
162
163/* research --- do a regexp search */
164
165int
166research(Regexp *rp, register const char *str, int start,
167	register size_t len, int need_start)
168{
169	const char *ret = str;
170
171	if (ret) {
172		/*
173		 * Passing NULL as last arg speeds up search for cases
174		 * where we don't need the start/end info.
175		 */
176		int res = re_search(&(rp->pat), str, start+len,
177				start, len, need_start ? &(rp->regs) : NULL);
178
179		/*
180		 * A return of -2 indicates that a heuristic in
181		 * regex decided it might allocate too much memory
182		 * on the C stack. This doesn't apply to gawk, which
183		 * uses REGEX_MALLOC. This is dealt with by the
184		 * assignment to re_max_failures in resetup().
185		 * Naetheless, we keep this code here as a fallback.
186		 *
187		 * XXX: The above comment is obsolete; the new regex
188		 * doesn't have an re_max_failures variable. But we
189		 * keep the code here just in case.
190		 */
191		if (res == -2) {
192			/* the 10 here is arbitrary */
193			fatal(_("regex match failed, not enough memory to match string \"%.*s%s\""),
194					(int) (len > 10 ? 10 : len), str + start,
195					len > 10 ? "..." : "");
196		}
197		return res;
198	} else
199		return -1;
200}
201
202/* refree --- free up the dynamic memory used by a compiled regexp */
203
204void
205refree(Regexp *rp)
206{
207	/*
208	 * This isn't malloced, don't let regfree free it.
209	 * (This is strictly necessary only for the old
210	 * version of regex, but it's a good idea to keep it
211	 * here in case regex internals change in the future.)
212	 */
213	rp->pat.translate = NULL;
214
215	regfree(& rp->pat);
216	if (rp->regs.start)
217		free(rp->regs.start);
218	if (rp->regs.end)
219		free(rp->regs.end);
220	free(rp);
221}
222
223/* re_update --- recompile a dynamic regexp */
224
225Regexp *
226re_update(NODE *t)
227{
228	NODE *t1;
229
230	if ((t->re_flags & CASE) == IGNORECASE) {
231		if ((t->re_flags & CONST) != 0) {
232			assert(t->type == Node_regex);
233			return t->re_reg;
234		}
235		t1 = force_string(tree_eval(t->re_exp));
236		if (t->re_text != NULL) {
237			if (cmp_nodes(t->re_text, t1) == 0) {
238				free_temp(t1);
239				return t->re_reg;
240			}
241			unref(t->re_text);
242		}
243		t->re_text = dupnode(t1);
244		free_temp(t1);
245	}
246	if (t->re_reg != NULL)
247		refree(t->re_reg);
248	if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
249		t1 = force_string(tree_eval(t->re_exp));
250		unref(t->re_text);
251		t->re_text = dupnode(t1);
252		free_temp(t1);
253	}
254	t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
255				IGNORECASE);
256	t->re_flags &= ~CASE;
257	t->re_flags |= IGNORECASE;
258	return t->re_reg;
259}
260
261/* resetup --- choose what kind of regexps we match */
262
263void
264resetup()
265{
266	if (do_posix)
267		syn = RE_SYNTAX_POSIX_AWK;	/* strict POSIX re's */
268	else if (do_traditional)
269		syn = RE_SYNTAX_AWK;		/* traditional Unix awk re's */
270	else
271		syn = RE_SYNTAX_GNU_AWK;	/* POSIX re's + GNU ops */
272
273	/*
274	 * Interval expressions are off by default, since it's likely to
275	 * break too many old programs to have them on.
276	 */
277	if (do_intervals)
278		syn |= RE_INTERVALS;
279
280	(void) re_set_syntax(syn);
281}
282
283/* reisstring --- return TRUE if the RE match is a simple string match */
284
285int
286reisstring(const char *text, size_t len, Regexp *re, const char *buf)
287{
288	static char metas[] = ".*+(){}[]|?^$\\";
289	int i;
290	int res;
291	const char *matched;
292
293	/* simple checking for has meta characters in re */
294	for (i = 0; i < len; i++) {
295		if (strchr(metas, text[i]) != NULL) {
296			return FALSE;	/* give up early, can't be string match */
297		}
298	}
299
300	/* make accessable to gdb */
301	matched = &buf[RESTART(re, buf)];
302
303	res = STREQN(text, matched, len);
304
305	return res;
306}
307
/*
 * remaybelong --- return true if the RE contains * ? | +
 *
 * Scans exactly LEN bytes of TEXT.  Returns 1 if any of them is one of
 * the four operators, 0 otherwise (including when LEN is 0).
 *
 * The bytes are compared directly instead of with strchr(): strchr()
 * also matches the terminating NUL of its search string, which would
 * make every NUL byte embedded in TEXT look like an operator.
 */

int
remaybelong(const char *text, size_t len)
{
	const char *end = text + len;

	for (; text < end; text++) {
		switch (*text) {
		case '*':
		case '+':
		case '|':
		case '?':
			return 1;
		default:
			break;
		}
	}

	return 0;
}
321
322/* reflags2str --- make a regex flags value readable */
323
324const char *
325reflags2str(int flagval)
326{
327	static const struct flagtab values[] = {
328		{ RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
329		{ RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
330		{ RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
331		{ RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
332		{ RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
333		{ RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
334		{ RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
335		{ RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
336		{ RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
337		{ RE_INTERVALS, "RE_INTERVALS" },
338		{ RE_LIMITED_OPS, "RE_LIMITED_OPS" },
339		{ RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
340		{ RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
341		{ RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
342		{ RE_NO_BK_REFS, "RE_NO_BK_REFS" },
343		{ RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
344		{ RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
345		{ RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
346		{ RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
347		{ RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
348		{ RE_DEBUG, "RE_DEBUG" },
349		{ RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
350		{ RE_ICASE, "RE_ICASE" },
351		{ 0,	NULL },
352	};
353
354	return genflags2str(flagval, values);
355}
356