1/* Pattern Matchers for Regular Expressions. 2 Copyright (C) 1992, 1998, 2000, 2005-2006 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17#ifdef HAVE_CONFIG_H 18# include <config.h> 19#endif 20 21/* Specification. */ 22#include "libgrep.h" 23 24#include <ctype.h> 25#include <stdlib.h> 26#include <string.h> 27 28#include "error.h" 29#include "exitfail.h" 30#include "xalloc.h" 31#include "m-common.h" 32 33/* This must be included _after_ m-common.h: It depends on MBS_SUPPORT. */ 34#include "dfa.h" 35 36#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 37# define IN_CTYPE_DOMAIN(c) 1 38#else 39# define IN_CTYPE_DOMAIN(c) isascii(c) 40#endif 41#define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C)) 42 43struct patterns 44{ 45 /* Regex compiled regexp. */ 46 struct re_pattern_buffer regexbuf; 47 struct re_registers regs; /* This is here on account of a BRAIN-DEAD 48 Q@#%!# library interface in regex.c. */ 49}; 50 51struct compiled_regex { 52 bool match_words; 53 bool match_lines; 54 char eolbyte; 55 56 /* DFA compiled regexp. */ 57 struct dfa dfa; 58 59 /* The Regex compiled patterns. */ 60 struct patterns *patterns; 61 size_t pcount; 62 63 /* KWset compiled pattern. We compile a list of strings, at least one of 64 which is known to occur in any string matching the regexp. */ 65 struct compiled_kwset ckwset; 66 67 /* Number of compiled fixed strings known to exactly match the regexp. 68 If kwsexec returns < kwset_exact_matches, then we don't need to 69 call the regexp matcher at all. */ 70 int kwset_exact_matches; 71}; 72 73/* Callback from dfa.c. */ 74void 75dfaerror (const char *mesg) 76{ 77 error (exit_failure, 0, mesg); 78} 79 80/* If the DFA turns out to have some set of fixed strings one of 81 which must occur in the match, then we build a kwset matcher 82 to find those strings, and thus quickly filter out impossible 83 matches. */ 84static void 85kwsmusts (struct compiled_regex *cregex, 86 bool match_icase, bool match_words, bool match_lines, char eolbyte) 87{ 88 struct dfamust const *dm; 89 const char *err; 90 91 if (cregex->dfa.musts) 92 { 93 kwsinit (&cregex->ckwset, match_icase, match_words, match_lines, eolbyte); 94 /* First, we compile in the substrings known to be exact 95 matches. The kwset matcher will return the index 96 of the matching string that it chooses. */ 97 for (dm = cregex->dfa.musts; dm; dm = dm->next) 98 { 99 if (!dm->exact) 100 continue; 101 cregex->kwset_exact_matches++; 102 if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL) 103 error (exit_failure, 0, err); 104 } 105 /* Now, we compile the substrings that will require 106 the use of the regexp matcher. */ 107 for (dm = cregex->dfa.musts; dm; dm = dm->next) 108 { 109 if (dm->exact) 110 continue; 111 if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL) 112 error (exit_failure, 0, err); 113 } 114 if ((err = kwsprep (cregex->ckwset.kwset)) != NULL) 115 error (exit_failure, 0, err); 116 } 117} 118 119static void * 120Gcompile (const char *pattern, size_t pattern_size, 121 bool match_icase, bool match_words, bool match_lines, char eolbyte) 122{ 123 struct compiled_regex *cregex; 124 const char *err; 125 const char *sep; 126 size_t total = pattern_size; 127 const char *motif = pattern; 128 129 cregex = XMALLOC (struct compiled_regex); 130 memset (cregex, '\0', sizeof (struct compiled_regex)); 131 cregex->match_words = match_words; 132 cregex->match_lines = match_lines; 133 cregex->eolbyte = eolbyte; 134 cregex->patterns = NULL; 135 cregex->pcount = 0; 136 137 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); 138 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); 139 140 /* For GNU regex compiler we have to pass the patterns separately to detect 141 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 142 GNU regex should have raise a syntax error. The same for backref, where 143 the backref should have been local to each pattern. */ 144 do 145 { 146 size_t len; 147 sep = (const char *) memchr (motif, '\n', total); 148 if (sep) 149 { 150 len = sep - motif; 151 sep++; 152 total -= (len + 1); 153 } 154 else 155 { 156 len = total; 157 total = 0; 158 } 159 160 cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns)); 161 memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns)); 162 163 if ((err = re_compile_pattern (motif, len, 164 &(cregex->patterns[cregex->pcount].regexbuf))) != NULL) 165 error (exit_failure, 0, err); 166 cregex->pcount++; 167 168 motif = sep; 169 } while (sep && total != 0); 170 171 /* In the match_words and match_lines cases, we use a different pattern 172 for the DFA matcher that will quickly throw out cases that won't work. 173 Then if DFA succeeds we do some hairy stuff using the regex matcher 174 to decide whether the match should really count. */ 175 if (match_words || match_lines) 176 { 177 /* In the whole-word case, we use the pattern: 178 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\). 179 In the whole-line case, we use the pattern: 180 ^\(userpattern\)$. */ 181 182 static const char line_beg[] = "^\\("; 183 static const char line_end[] = "\\)$"; 184 static const char word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 185 static const char word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 186 char *n = XNMALLOC (sizeof word_beg - 1 + pattern_size + sizeof word_end, char); 187 size_t i; 188 strcpy (n, match_lines ? line_beg : word_beg); 189 i = strlen (n); 190 memcpy (n + i, pattern, pattern_size); 191 i += pattern_size; 192 strcpy (n + i, match_lines ? line_end : word_end); 193 i += strlen (n + i); 194 pattern = n; 195 pattern_size = i; 196 } 197 198 dfacomp (pattern, pattern_size, &cregex->dfa, 1); 199 kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte); 200 201 return cregex; 202} 203 204static void * 205compile (const char *pattern, size_t pattern_size, 206 bool match_icase, bool match_words, bool match_lines, char eolbyte, 207 reg_syntax_t syntax) 208{ 209 struct compiled_regex *cregex; 210 const char *err; 211 const char *sep; 212 size_t total = pattern_size; 213 const char *motif = pattern; 214 215 cregex = XMALLOC (struct compiled_regex); 216 memset (cregex, '\0', sizeof (struct compiled_regex)); 217 cregex->match_words = match_words; 218 cregex->match_lines = match_lines; 219 cregex->eolbyte = eolbyte; 220 cregex->patterns = NULL; 221 cregex->pcount = 0; 222 223 re_set_syntax (syntax); 224 dfasyntax (syntax, match_icase, eolbyte); 225 226 /* For GNU regex compiler we have to pass the patterns separately to detect 227 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 228 GNU regex should have raise a syntax error. The same for backref, where 229 the backref should have been local to each pattern. */ 230 do 231 { 232 size_t len; 233 sep = (const char *) memchr (motif, '\n', total); 234 if (sep) 235 { 236 len = sep - motif; 237 sep++; 238 total -= (len + 1); 239 } 240 else 241 { 242 len = total; 243 total = 0; 244 } 245 246 cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns)); 247 memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns)); 248 249 if ((err = re_compile_pattern (motif, len, 250 &(cregex->patterns[cregex->pcount].regexbuf))) != NULL) 251 error (exit_failure, 0, err); 252 cregex->pcount++; 253 254 motif = sep; 255 } while (sep && total != 0); 256 257 /* In the match_words and match_lines cases, we use a different pattern 258 for the DFA matcher that will quickly throw out cases that won't work. 259 Then if DFA succeeds we do some hairy stuff using the regex matcher 260 to decide whether the match should really count. */ 261 if (match_words || match_lines) 262 { 263 /* In the whole-word case, we use the pattern: 264 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$). 265 In the whole-line case, we use the pattern: 266 ^(userpattern)$. */ 267 268 static const char line_beg[] = "^("; 269 static const char line_end[] = ")$"; 270 static const char word_beg[] = "(^|[^[:alnum:]_])("; 271 static const char word_end[] = ")([^[:alnum:]_]|$)"; 272 char *n = XNMALLOC (sizeof word_beg - 1 + pattern_size + sizeof word_end, char); 273 size_t i; 274 strcpy (n, match_lines ? line_beg : word_beg); 275 i = strlen(n); 276 memcpy (n + i, pattern, pattern_size); 277 i += pattern_size; 278 strcpy (n + i, match_lines ? line_end : word_end); 279 i += strlen (n + i); 280 pattern = n; 281 pattern_size = i; 282 } 283 284 dfacomp (pattern, pattern_size, &cregex->dfa, 1); 285 kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte); 286 287 return cregex; 288} 289 290static void * 291Ecompile (const char *pattern, size_t pattern_size, 292 bool match_icase, bool match_words, bool match_lines, char eolbyte) 293{ 294 return compile (pattern, pattern_size, 295 match_icase, match_words, match_lines, eolbyte, 296 RE_SYNTAX_POSIX_EGREP); 297} 298 299static void * 300AWKcompile (const char *pattern, size_t pattern_size, 301 bool match_icase, bool match_words, bool match_lines, char eolbyte) 302{ 303 return compile (pattern, pattern_size, 304 match_icase, match_words, match_lines, eolbyte, 305 RE_SYNTAX_AWK); 306} 307 308static size_t 309EGexecute (const void *compiled_pattern, 310 const char *buf, size_t buf_size, 311 size_t *match_size, bool exact) 312{ 313 struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern; 314 register const char *buflim, *beg, *end; 315 char eol = cregex->eolbyte; 316 int backref, start, len; 317 struct kwsmatch kwsm; 318 size_t i; 319#ifdef MBS_SUPPORT 320 char *mb_properties = NULL; 321#endif /* MBS_SUPPORT */ 322 323#ifdef MBS_SUPPORT 324 if (MB_CUR_MAX > 1 && cregex->ckwset.kwset) 325 mb_properties = check_multibyte_string (buf, buf_size); 326#endif /* MBS_SUPPORT */ 327 328 buflim = buf + buf_size; 329 330 for (beg = end = buf; end < buflim; beg = end) 331 { 332 if (!exact) 333 { 334 if (cregex->ckwset.kwset) 335 { 336 /* Find a possible match using the KWset matcher. */ 337 size_t offset = kwsexec (cregex->ckwset.kwset, beg, buflim - beg, &kwsm); 338 if (offset == (size_t) -1) 339 { 340#ifdef MBS_SUPPORT 341 if (MB_CUR_MAX > 1) 342 free (mb_properties); 343#endif 344 return (size_t)-1; 345 } 346 beg += offset; 347 /* Narrow down to the line containing the candidate, and 348 run it through DFA. */ 349 end = (const char *) memchr (beg, eol, buflim - beg); 350 if (end != NULL) 351 end++; 352 else 353 end = buflim; 354#ifdef MBS_SUPPORT 355 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) 356 continue; 357#endif 358 while (beg > buf && beg[-1] != eol) 359 --beg; 360 if (kwsm.index < cregex->kwset_exact_matches) 361 goto success; 362 if (dfaexec (&cregex->dfa, beg, end - beg, &backref) == (size_t) -1) 363 continue; 364 } 365 else 366 { 367 /* No good fixed strings; start with DFA. */ 368 size_t offset = dfaexec (&cregex->dfa, beg, buflim - beg, &backref); 369 if (offset == (size_t) -1) 370 break; 371 /* Narrow down to the line we've found. */ 372 beg += offset; 373 end = (const char *) memchr (beg, eol, buflim - beg); 374 if (end != NULL) 375 end++; 376 else 377 end = buflim; 378 while (beg > buf && beg[-1] != eol) 379 --beg; 380 } 381 /* Successful, no backreferences encountered! */ 382 if (!backref) 383 goto success; 384 } 385 else 386 end = beg + buf_size; 387 388 /* If we've made it to this point, this means DFA has seen 389 a probable match, and we need to run it through Regex. */ 390 for (i = 0; i < cregex->pcount; i++) 391 { 392 cregex->patterns[i].regexbuf.not_eol = 0; 393 if (0 <= (start = re_search (&(cregex->patterns[i].regexbuf), beg, 394 end - beg - 1, 0, 395 end - beg - 1, &(cregex->patterns[i].regs)))) 396 { 397 len = cregex->patterns[i].regs.end[0] - start; 398 if (exact) 399 { 400 *match_size = len; 401 return start; 402 } 403 if ((!cregex->match_lines && !cregex->match_words) 404 || (cregex->match_lines && len == end - beg - 1)) 405 goto success; 406 /* If -w, check if the match aligns with word boundaries. 407 We do this iteratively because: 408 (a) the line may contain more than one occurence of the 409 pattern, and 410 (b) Several alternatives in the pattern might be valid at a 411 given point, and we may need to consider a shorter one to 412 find a word boundary. */ 413 if (cregex->match_words) 414 while (start >= 0) 415 { 416 if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1])) 417 && (len == end - beg - 1 418 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len]))) 419 goto success; 420 if (len > 0) 421 { 422 /* Try a shorter length anchored at the same place. */ 423 --len; 424 cregex->patterns[i].regexbuf.not_eol = 1; 425 len = re_match (&(cregex->patterns[i].regexbuf), beg, 426 start + len, start, 427 &(cregex->patterns[i].regs)); 428 } 429 if (len <= 0) 430 { 431 /* Try looking further on. */ 432 if (start == end - beg - 1) 433 break; 434 ++start; 435 cregex->patterns[i].regexbuf.not_eol = 0; 436 start = re_search (&(cregex->patterns[i].regexbuf), beg, 437 end - beg - 1, 438 start, end - beg - 1 - start, 439 &(cregex->patterns[i].regs)); 440 len = cregex->patterns[i].regs.end[0] - start; 441 } 442 } 443 } 444 } /* for Regex patterns. */ 445 } /* for (beg = end ..) */ 446#ifdef MBS_SUPPORT 447 if (MB_CUR_MAX > 1 && mb_properties) 448 free (mb_properties); 449#endif /* MBS_SUPPORT */ 450 return (size_t) -1; 451 452 success: 453#ifdef MBS_SUPPORT 454 if (MB_CUR_MAX > 1 && mb_properties) 455 free (mb_properties); 456#endif /* MBS_SUPPORT */ 457 *match_size = end - beg; 458 return beg - buf; 459} 460 461static void 462EGfree (void *compiled_pattern) 463{ 464 struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern; 465 466 dfafree (&cregex->dfa); 467 free (cregex->patterns); 468 free (cregex->ckwset.trans); 469 free (cregex); 470} 471 472/* POSIX Basic Regular Expressions */ 473matcher_t matcher_grep = 474 { 475 Gcompile, 476 EGexecute, 477 EGfree 478 }; 479 480/* POSIX Extended Regular Expressions */ 481matcher_t matcher_egrep = 482 { 483 Ecompile, 484 EGexecute, 485 EGfree 486 }; 487 488/* AWK Regular Expressions */ 489matcher_t matcher_awk = 490 { 491 AWKcompile, 492 EGexecute, 493 EGfree 494 }; 495