1/* 2 * re.c - compile regular expressions. 3 */ 4 5/* 6 * Copyright (C) 1991-2003 the Free Software Foundation, Inc. 7 * 8 * This file is part of GAWK, the GNU implementation of the 9 * AWK Programming Language. 10 * 11 * GAWK is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2 of the License, or 14 * (at your option) any later version. 15 * 16 * GAWK is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 * GNU General Public License for more details. 20 * 21 * You should have received a copy of the GNU General Public License 22 * along with this program; if not, write to the Free Software 23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA 24 */ 25 26#include "awk.h" 27 28static reg_syntax_t syn; 29 30/* make_regexp --- generate compiled regular expressions */ 31 32Regexp * 33make_regexp(const char *s, size_t len, int ignorecase) 34{ 35 Regexp *rp; 36 const char *rerr; 37 const char *src = s; 38 char *temp; 39 const char *end = s + len; 40 register char *dest; 41 register int c, c2; 42#ifdef MBS_SUPPORT 43 /* The number of bytes in the current multbyte character. 44 It is 0, when the current character is a singlebyte character. */ 45 size_t is_multibyte = 0; 46 mbstate_t mbs; 47 48 if (gawk_mb_cur_max > 1) 49 memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */ 50#endif 51 52 /* Handle escaped characters first. */ 53 54 /* 55 * Build a copy of the string (in dest) with the 56 * escaped characters translated, and generate the regex 57 * from that. 58 */ 59 emalloc(dest, char *, len + 2, "make_regexp"); 60 temp = dest; 61 62 while (src < end) { 63#ifdef MBS_SUPPORT 64 if (gawk_mb_cur_max > 1 && !is_multibyte) { 65 /* The previous byte is a singlebyte character, or last byte 66 of a multibyte character. We check the next character. */ 67 is_multibyte = mbrlen(src, end - src, &mbs); 68 if ((is_multibyte == 1) || (is_multibyte == (size_t) -1) 69 || (is_multibyte == (size_t) -2 || (is_multibyte == 0))) { 70 /* We treat it as a singlebyte character. */ 71 is_multibyte = 0; 72 } 73 } 74#endif 75 76 if ( 77#ifdef MBS_SUPPORT 78 /* We skip multibyte character, since it must not be a special 79 character. */ 80 (gawk_mb_cur_max == 1 || ! is_multibyte) && 81#endif 82 (*src == '\\')) { 83 c = *++src; 84 switch (c) { 85 case 'a': 86 case 'b': 87 case 'f': 88 case 'n': 89 case 'r': 90 case 't': 91 case 'v': 92 case 'x': 93 case '0': 94 case '1': 95 case '2': 96 case '3': 97 case '4': 98 case '5': 99 case '6': 100 case '7': 101 c2 = parse_escape(&src); 102 if (c2 < 0) 103 cant_happen(); 104 /* 105 * Unix awk treats octal (and hex?) chars 106 * literally in re's, so escape regexp 107 * metacharacters. 108 */ 109 if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x') 110 && strchr("()|*+?.^$\\[]", c2) != NULL) 111 *dest++ = '\\'; 112 *dest++ = (char) c2; 113 break; 114 case '8': 115 case '9': /* a\9b not valid */ 116 *dest++ = c; 117 src++; 118 break; 119 case 'y': /* normally \b */ 120 /* gnu regex op */ 121 if (! do_traditional) { 122 *dest++ = '\\'; 123 *dest++ = 'b'; 124 src++; 125 break; 126 } 127 /* else, fall through */ 128 default: 129 *dest++ = '\\'; 130 *dest++ = (char) c; 131 src++; 132 break; 133 } /* switch */ 134 } else 135 *dest++ = *src++; /* not '\\' */ 136#ifdef MBS_SUPPORT 137 if (gawk_mb_cur_max > 1 && is_multibyte) 138 is_multibyte--; 139#endif 140 } /* while */ 141 142 *dest = '\0' ; /* Only necessary if we print dest ? */ 143 emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); 144 memset((char *) rp, 0, sizeof(*rp)); 145 rp->pat.allocated = 0; /* regex will allocate the buffer */ 146 emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); 147 148 if (ignorecase) 149 rp->pat.translate = casetable; 150 else 151 rp->pat.translate = NULL; 152 len = dest - temp; 153 if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL) 154 fatal("%s: /%s/", rerr, temp); /* rerr already gettextized inside regex routines */ 155 156 /* gack. this must be done *after* re_compile_pattern */ 157 rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */ 158 159 free(temp); 160 return rp; 161} 162 163/* research --- do a regexp search */ 164 165int 166research(Regexp *rp, register const char *str, int start, 167 register size_t len, int need_start) 168{ 169 const char *ret = str; 170 171 if (ret) { 172 /* 173 * Passing NULL as last arg speeds up search for cases 174 * where we don't need the start/end info. 175 */ 176 int res = re_search(&(rp->pat), str, start+len, 177 start, len, need_start ? &(rp->regs) : NULL); 178 179 /* 180 * A return of -2 indicates that a heuristic in 181 * regex decided it might allocate too much memory 182 * on the C stack. This doesn't apply to gawk, which 183 * uses REGEX_MALLOC. This is dealt with by the 184 * assignment to re_max_failures in resetup(). 185 * Naetheless, we keep this code here as a fallback. 186 * 187 * XXX: The above comment is obsolete; the new regex 188 * doesn't have an re_max_failures variable. But we 189 * keep the code here just in case. 190 */ 191 if (res == -2) { 192 /* the 10 here is arbitrary */ 193 fatal(_("regex match failed, not enough memory to match string \"%.*s%s\""), 194 (int) (len > 10 ? 10 : len), str + start, 195 len > 10 ? "..." : ""); 196 } 197 return res; 198 } else 199 return -1; 200} 201 202/* refree --- free up the dynamic memory used by a compiled regexp */ 203 204void 205refree(Regexp *rp) 206{ 207 /* 208 * This isn't malloced, don't let regfree free it. 209 * (This is strictly necessary only for the old 210 * version of regex, but it's a good idea to keep it 211 * here in case regex internals change in the future.) 212 */ 213 rp->pat.translate = NULL; 214 215 regfree(& rp->pat); 216 if (rp->regs.start) 217 free(rp->regs.start); 218 if (rp->regs.end) 219 free(rp->regs.end); 220 free(rp); 221} 222 223/* re_update --- recompile a dynamic regexp */ 224 225Regexp * 226re_update(NODE *t) 227{ 228 NODE *t1; 229 230 if ((t->re_flags & CASE) == IGNORECASE) { 231 if ((t->re_flags & CONST) != 0) { 232 assert(t->type == Node_regex); 233 return t->re_reg; 234 } 235 t1 = force_string(tree_eval(t->re_exp)); 236 if (t->re_text != NULL) { 237 if (cmp_nodes(t->re_text, t1) == 0) { 238 free_temp(t1); 239 return t->re_reg; 240 } 241 unref(t->re_text); 242 } 243 t->re_text = dupnode(t1); 244 free_temp(t1); 245 } 246 if (t->re_reg != NULL) 247 refree(t->re_reg); 248 if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) { 249 t1 = force_string(tree_eval(t->re_exp)); 250 unref(t->re_text); 251 t->re_text = dupnode(t1); 252 free_temp(t1); 253 } 254 t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen, 255 IGNORECASE); 256 t->re_flags &= ~CASE; 257 t->re_flags |= IGNORECASE; 258 return t->re_reg; 259} 260 261/* resetup --- choose what kind of regexps we match */ 262 263void 264resetup() 265{ 266 if (do_posix) 267 syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */ 268 else if (do_traditional) 269 syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */ 270 else 271 syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */ 272 273 /* 274 * Interval expressions are off by default, since it's likely to 275 * break too many old programs to have them on. 276 */ 277 if (do_intervals) 278 syn |= RE_INTERVALS; 279 280 (void) re_set_syntax(syn); 281} 282 283/* reisstring --- return TRUE if the RE match is a simple string match */ 284 285int 286reisstring(const char *text, size_t len, Regexp *re, const char *buf) 287{ 288 static char metas[] = ".*+(){}[]|?^$\\"; 289 int i; 290 int res; 291 const char *matched; 292 293 /* simple checking for has meta characters in re */ 294 for (i = 0; i < len; i++) { 295 if (strchr(metas, text[i]) != NULL) { 296 return FALSE; /* give up early, can't be string match */ 297 } 298 } 299 300 /* make accessable to gdb */ 301 matched = &buf[RESTART(re, buf)]; 302 303 res = STREQN(text, matched, len); 304 305 return res; 306} 307 308/* remaybelong --- return TRUE if the RE contains * ? | + */ 309 310int 311remaybelong(const char *text, size_t len) 312{ 313 while (len--) { 314 if (strchr("*+|?", *text++) != NULL) { 315 return TRUE; 316 } 317 } 318 319 return FALSE; 320} 321 322/* reflags2str --- make a regex flags value readable */ 323 324const char * 325reflags2str(int flagval) 326{ 327 static const struct flagtab values[] = { 328 { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" }, 329 { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" }, 330 { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" }, 331 { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" }, 332 { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" }, 333 { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" }, 334 { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" }, 335 { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" }, 336 { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" }, 337 { RE_INTERVALS, "RE_INTERVALS" }, 338 { RE_LIMITED_OPS, "RE_LIMITED_OPS" }, 339 { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" }, 340 { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" }, 341 { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" }, 342 { RE_NO_BK_REFS, "RE_NO_BK_REFS" }, 343 { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" }, 344 { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" }, 345 { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" }, 346 { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" }, 347 { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" }, 348 { RE_DEBUG, "RE_DEBUG" }, 349 { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" }, 350 { RE_ICASE, "RE_ICASE" }, 351 { 0, NULL }, 352 }; 353 354 return genflags2str(flagval, values); 355} 356