bin/gawk/re.c

/*
 * re.c - compile regular expressions.
 */

/*
 * Copyright (C) 1991-2003 the Free Software Foundation, Inc.
 *
 * This file is part of GAWK, the GNU implementation of the
 * AWK Programming Language.
 *
 * GAWK is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * GAWK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
 */

#include "awk.h"

static reg_syntax_t syn;

/* make_regexp --- generate compiled regular expressions */

Regexp *
make_regexp(const char *s, size_t len, int ignorecase)
{
	Regexp *rp;
	const char *rerr;
	const char *src = s;
	char *temp;
	const char *end = s + len;
	register char *dest;
	register int c, c2;
#ifdef MBS_SUPPORT
	/* The number of bytes in the current multbyte character.
	   It is 0, when the current character is a singlebyte character.  */
	size_t is_multibyte = 0;
	mbstate_t mbs;

	if (gawk_mb_cur_max > 1)
		memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
#endif

	/* Handle escaped characters first. */

	/*
	 * Build a copy of the string (in dest) with the
	 * escaped characters translated, and generate the regex
	 * from that.
	 */
	emalloc(dest, char *, len + 2, "make_regexp");
	temp = dest;

	while (src < end) {
#ifdef MBS_SUPPORT
		if (gawk_mb_cur_max > 1 && !is_multibyte) {
			/* The previous byte is a singlebyte character, or last byte
			   of a multibyte character.  We check the next character.  */
			is_multibyte = mbrlen(src, end - src, &mbs);
			if ((is_multibyte == 1) || (is_multibyte == (size_t) -1)
				|| (is_multibyte == (size_t) -2 || (is_multibyte == 0))) {
				/* We treat it as a singlebyte character.  */
				is_multibyte = 0;
			}
		}
#endif

		if (
#ifdef MBS_SUPPORT
		/* We skip multibyte character, since it must not be a special
		   character.  */
		    (gawk_mb_cur_max == 1 || ! is_multibyte) &&
#endif
		    (*src == '\\')) {
			c = *++src;
			switch (c) {
			case 'a':
			case 'b':
			case 'f':
			case 'n':
			case 'r':
			case 't':
			case 'v':
			case 'x':
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
				c2 = parse_escape(&src);
				if (c2 < 0)
					cant_happen();
				/*
				 * Unix awk treats octal (and hex?) chars
				 * literally in re's, so escape regexp
				 * metacharacters.
				 */
				if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x')
				    && strchr("()|*+?.^$\\[]", c2) != NULL)
					*dest++ = '\\';
				*dest++ = (char) c2;
				break;
			case '8':
			case '9':	/* a\9b not valid */
				*dest++ = c;
				src++;
				break;
			case 'y':	/* normally \b */
				/* gnu regex op */
				if (! do_traditional) {
					*dest++ = '\\';
					*dest++ = 'b';
					src++;
					break;
				}
				/* else, fall through */
			default:
				*dest++ = '\\';
				*dest++ = (char) c;
				src++;
				break;
			} /* switch */
		} else
			*dest++ = *src++;	/* not '\\' */
#ifdef MBS_SUPPORT
		if (gawk_mb_cur_max > 1 && is_multibyte)
			is_multibyte--;
#endif
	} /* while */

	*dest = '\0' ;	/* Only necessary if we print dest ? */
	emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
	memset((char *) rp, 0, sizeof(*rp));
	rp->pat.allocated = 0;	/* regex will allocate the buffer */
	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");

	if (ignorecase)
		rp->pat.translate = casetable;
	else
		rp->pat.translate = NULL;
	len = dest - temp;
	if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
		fatal("%s: /%s/", rerr, temp);	/* rerr already gettextized inside regex routines */

	/* gack. this must be done *after* re_compile_pattern */
	rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */

	free(temp);
	return rp;
}

/* research --- do a regexp search */

int
research(Regexp *rp, register const char *str, int start,
	register size_t len, int need_start)
{
	const char *ret = str;

	if (ret) {
		/*
		 * Passing NULL as last arg speeds up search for cases
		 * where we don't need the start/end info.
		 */
		int res = re_search(&(rp->pat), str, start+len,
				start, len, need_start ? &(rp->regs) : NULL);

		/*
		 * A return of -2 indicates that a heuristic in
		 * regex decided it might allocate too much memory
		 * on the C stack. This doesn't apply to gawk, which
		 * uses REGEX_MALLOC. This is dealt with by the
		 * assignment to re_max_failures in resetup().
		 * Naetheless, we keep this code here as a fallback.
		 *
		 * XXX: The above comment is obsolete; the new regex
		 * doesn't have an re_max_failures variable. But we
		 * keep the code here just in case.
		 */
		if (res == -2) {
			/* the 10 here is arbitrary */
			fatal(_("regex match failed, not enough memory to match string \"%.*s%s\""),
					(int) (len > 10 ? 10 : len), str + start,
					len > 10 ? "..." : "");
		}
		return res;
	} else
		return -1;
}

/* refree --- free up the dynamic memory used by a compiled regexp */

void
refree(Regexp *rp)
{
	/*
	 * This isn't malloced, don't let regfree free it.
	 * (This is strictly necessary only for the old
	 * version of regex, but it's a good idea to keep it
	 * here in case regex internals change in the future.)
	 */
	rp->pat.translate = NULL;

	regfree(& rp->pat);
	if (rp->regs.start)
		free(rp->regs.start);
	if (rp->regs.end)
		free(rp->regs.end);
	free(rp);
}

/* re_update --- recompile a dynamic regexp */

Regexp *
re_update(NODE *t)
{
	NODE *t1;

	if ((t->re_flags & CASE) == IGNORECASE) {
		if ((t->re_flags & CONST) != 0) {
			assert(t->type == Node_regex);
			return t->re_reg;
		}
		t1 = force_string(tree_eval(t->re_exp));
		if (t->re_text != NULL) {
			if (cmp_nodes(t->re_text, t1) == 0) {
				free_temp(t1);
				return t->re_reg;
			}
			unref(t->re_text);
		}
		t->re_text = dupnode(t1);
		free_temp(t1);
	}
	if (t->re_reg != NULL)
		refree(t->re_reg);
	if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
		t1 = force_string(tree_eval(t->re_exp));
		unref(t->re_text);
		t->re_text = dupnode(t1);
		free_temp(t1);
	}
	t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
				IGNORECASE);
	t->re_flags &= ~CASE;
	t->re_flags |= IGNORECASE;
	return t->re_reg;
}

/* resetup --- choose what kind of regexps we match */

void
resetup()
{
	if (do_posix)
		syn = RE_SYNTAX_POSIX_AWK;	/* strict POSIX re's */
	else if (do_traditional)
		syn = RE_SYNTAX_AWK;		/* traditional Unix awk re's */
	else
		syn = RE_SYNTAX_GNU_AWK;	/* POSIX re's + GNU ops */

	/*
	 * Interval expressions are off by default, since it's likely to
	 * break too many old programs to have them on.
	 */
	if (do_intervals)
		syn |= RE_INTERVALS;

	(void) re_set_syntax(syn);
}

/* reisstring --- return TRUE if the RE match is a simple string match */

int
reisstring(const char *text, size_t len, Regexp *re, const char *buf)
{
	static char metas[] = ".*+(){}[]|?^$\\";
	int i;
	int res;
	const char *matched;

	/* simple checking for has meta characters in re */
	for (i = 0; i < len; i++) {
		if (strchr(metas, text[i]) != NULL) {
			return FALSE;	/* give up early, can't be string match */
		}
	}

	/* make accessable to gdb */
	matched = &buf[RESTART(re, buf)];

	res = STREQN(text, matched, len);

	return res;
}

/* remaybelong --- return TRUE if the RE contains * ? | + */

int
remaybelong(const char *text, size_t len)
{
	while (len--) {
		if (strchr("*+|?", *text++) != NULL) {
			return TRUE;
		}
	}

	return FALSE;
}

/* reflags2str --- make a regex flags value readable */

const char *
reflags2str(int flagval)
{
	static const struct flagtab values[] = {
		{ RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
		{ RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
		{ RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
		{ RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
		{ RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
		{ RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
		{ RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
		{ RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
		{ RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
		{ RE_INTERVALS, "RE_INTERVALS" },
		{ RE_LIMITED_OPS, "RE_LIMITED_OPS" },
		{ RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
		{ RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
		{ RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
		{ RE_NO_BK_REFS, "RE_NO_BK_REFS" },
		{ RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
		{ RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
		{ RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
		{ RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
		{ RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
		{ RE_DEBUG, "RE_DEBUG" },
		{ RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
		{ RE_ICASE, "RE_ICASE" },
		{ 0,	NULL },
	};

	return genflags2str(flagval, values);
}