1/* Pattern Matchers for Regular Expressions. 2 Copyright (C) 1992, 1998, 2000, 2005 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software Foundation, 16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 17 18#ifdef HAVE_CONFIG_H 19# include <config.h> 20#endif 21 22/* Specification. */ 23#include "libgrep.h" 24 25#include <ctype.h> 26#include <stdlib.h> 27#include <string.h> 28 29#include "error.h" 30#include "exitfail.h" 31#include "xalloc.h" 32#include "m-common.h" 33 34/* This must be included _after_ m-common.h: It depends on MBS_SUPPORT. */ 35#include "dfa.h" 36 37#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 38# define IN_CTYPE_DOMAIN(c) 1 39#else 40# define IN_CTYPE_DOMAIN(c) isascii(c) 41#endif 42#define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C)) 43 44struct compiled_regex { 45 bool match_words; 46 bool match_lines; 47 char eolbyte; 48 49 /* DFA compiled regexp. */ 50 struct dfa dfa; 51 52 /* The Regex compiled patterns. */ 53 struct patterns 54 { 55 /* Regex compiled regexp. */ 56 struct re_pattern_buffer regexbuf; 57 struct re_registers regs; /* This is here on account of a BRAIN-DEAD 58 Q@#%!# library interface in regex.c. */ 59 } *patterns; 60 size_t pcount; 61 62 /* KWset compiled pattern. We compile a list of strings, at least one of 63 which is known to occur in any string matching the regexp. */ 64 struct compiled_kwset ckwset; 65 66 /* Number of compiled fixed strings known to exactly match the regexp. 67 If kwsexec returns < kwset_exact_matches, then we don't need to 68 call the regexp matcher at all. */ 69 int kwset_exact_matches; 70}; 71 72/* Callback from dfa.c. */ 73void 74dfaerror (const char *mesg) 75{ 76 error (exit_failure, 0, mesg); 77} 78 79/* If the DFA turns out to have some set of fixed strings one of 80 which must occur in the match, then we build a kwset matcher 81 to find those strings, and thus quickly filter out impossible 82 matches. */ 83static void 84kwsmusts (struct compiled_regex *cregex, 85 bool match_icase, bool match_words, bool match_lines, char eolbyte) 86{ 87 struct dfamust const *dm; 88 const char *err; 89 90 if (cregex->dfa.musts) 91 { 92 kwsinit (&cregex->ckwset, match_icase, match_words, match_lines, eolbyte); 93 /* First, we compile in the substrings known to be exact 94 matches. The kwset matcher will return the index 95 of the matching string that it chooses. */ 96 for (dm = cregex->dfa.musts; dm; dm = dm->next) 97 { 98 if (!dm->exact) 99 continue; 100 cregex->kwset_exact_matches++; 101 if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL) 102 error (exit_failure, 0, err); 103 } 104 /* Now, we compile the substrings that will require 105 the use of the regexp matcher. */ 106 for (dm = cregex->dfa.musts; dm; dm = dm->next) 107 { 108 if (dm->exact) 109 continue; 110 if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL) 111 error (exit_failure, 0, err); 112 } 113 if ((err = kwsprep (cregex->ckwset.kwset)) != NULL) 114 error (exit_failure, 0, err); 115 } 116} 117 118static void * 119Gcompile (const char *pattern, size_t pattern_size, 120 bool match_icase, bool match_words, bool match_lines, char eolbyte) 121{ 122 struct compiled_regex *cregex; 123 const char *err; 124 const char *sep; 125 size_t total = pattern_size; 126 const char *motif = pattern; 127 128 cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex)); 129 memset (cregex, '\0', sizeof (struct compiled_regex)); 130 cregex->match_words = match_words; 131 cregex->match_lines = match_lines; 132 cregex->eolbyte = eolbyte; 133 cregex->patterns = NULL; 134 cregex->pcount = 0; 135 136 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); 137 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); 138 139 /* For GNU regex compiler we have to pass the patterns separately to detect 140 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 141 GNU regex should have raise a syntax error. The same for backref, where 142 the backref should have been local to each pattern. */ 143 do 144 { 145 size_t len; 146 sep = memchr (motif, '\n', total); 147 if (sep) 148 { 149 len = sep - motif; 150 sep++; 151 total -= (len + 1); 152 } 153 else 154 { 155 len = total; 156 total = 0; 157 } 158 159 cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns)); 160 memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns)); 161 162 if ((err = re_compile_pattern (motif, len, 163 &(cregex->patterns[cregex->pcount].regexbuf))) != NULL) 164 error (exit_failure, 0, err); 165 cregex->pcount++; 166 167 motif = sep; 168 } while (sep && total != 0); 169 170 /* In the match_words and match_lines cases, we use a different pattern 171 for the DFA matcher that will quickly throw out cases that won't work. 172 Then if DFA succeeds we do some hairy stuff using the regex matcher 173 to decide whether the match should really count. */ 174 if (match_words || match_lines) 175 { 176 /* In the whole-word case, we use the pattern: 177 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\). 178 In the whole-line case, we use the pattern: 179 ^\(userpattern\)$. */ 180 181 static const char line_beg[] = "^\\("; 182 static const char line_end[] = "\\)$"; 183 static const char word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 184 static const char word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 185 char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end); 186 size_t i; 187 strcpy (n, match_lines ? line_beg : word_beg); 188 i = strlen (n); 189 memcpy (n + i, pattern, pattern_size); 190 i += pattern_size; 191 strcpy (n + i, match_lines ? line_end : word_end); 192 i += strlen (n + i); 193 pattern = n; 194 pattern_size = i; 195 } 196 197 dfacomp (pattern, pattern_size, &cregex->dfa, 1); 198 kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte); 199 200 return cregex; 201} 202 203static void * 204compile (const char *pattern, size_t pattern_size, 205 bool match_icase, bool match_words, bool match_lines, char eolbyte, 206 reg_syntax_t syntax) 207{ 208 struct compiled_regex *cregex; 209 const char *err; 210 const char *sep; 211 size_t total = pattern_size; 212 const char *motif = pattern; 213 214 cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex)); 215 memset (cregex, '\0', sizeof (struct compiled_regex)); 216 cregex->match_words = match_words; 217 cregex->match_lines = match_lines; 218 cregex->eolbyte = eolbyte; 219 cregex->patterns = NULL; 220 cregex->pcount = 0; 221 222 re_set_syntax (syntax); 223 dfasyntax (syntax, match_icase, eolbyte); 224 225 /* For GNU regex compiler we have to pass the patterns separately to detect 226 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 227 GNU regex should have raise a syntax error. The same for backref, where 228 the backref should have been local to each pattern. */ 229 do 230 { 231 size_t len; 232 sep = memchr (motif, '\n', total); 233 if (sep) 234 { 235 len = sep - motif; 236 sep++; 237 total -= (len + 1); 238 } 239 else 240 { 241 len = total; 242 total = 0; 243 } 244 245 cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns)); 246 memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns)); 247 248 if ((err = re_compile_pattern (motif, len, 249 &(cregex->patterns[cregex->pcount].regexbuf))) != NULL) 250 error (exit_failure, 0, err); 251 cregex->pcount++; 252 253 motif = sep; 254 } while (sep && total != 0); 255 256 /* In the match_words and match_lines cases, we use a different pattern 257 for the DFA matcher that will quickly throw out cases that won't work. 258 Then if DFA succeeds we do some hairy stuff using the regex matcher 259 to decide whether the match should really count. */ 260 if (match_words || match_lines) 261 { 262 /* In the whole-word case, we use the pattern: 263 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$). 264 In the whole-line case, we use the pattern: 265 ^(userpattern)$. */ 266 267 static const char line_beg[] = "^("; 268 static const char line_end[] = ")$"; 269 static const char word_beg[] = "(^|[^[:alnum:]_])("; 270 static const char word_end[] = ")([^[:alnum:]_]|$)"; 271 char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end); 272 size_t i; 273 strcpy (n, match_lines ? line_beg : word_beg); 274 i = strlen(n); 275 memcpy (n + i, pattern, pattern_size); 276 i += pattern_size; 277 strcpy (n + i, match_lines ? line_end : word_end); 278 i += strlen (n + i); 279 pattern = n; 280 pattern_size = i; 281 } 282 283 dfacomp (pattern, pattern_size, &cregex->dfa, 1); 284 kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte); 285 286 return cregex; 287} 288 289static void * 290Ecompile (const char *pattern, size_t pattern_size, 291 bool match_icase, bool match_words, bool match_lines, char eolbyte) 292{ 293 return compile (pattern, pattern_size, 294 match_icase, match_words, match_lines, eolbyte, 295 RE_SYNTAX_POSIX_EGREP); 296} 297 298static void * 299AWKcompile (const char *pattern, size_t pattern_size, 300 bool match_icase, bool match_words, bool match_lines, char eolbyte) 301{ 302 return compile (pattern, pattern_size, 303 match_icase, match_words, match_lines, eolbyte, 304 RE_SYNTAX_AWK); 305} 306 307static size_t 308EGexecute (const void *compiled_pattern, 309 const char *buf, size_t buf_size, 310 size_t *match_size, bool exact) 311{ 312 struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern; 313 register const char *buflim, *beg, *end; 314 char eol = cregex->eolbyte; 315 int backref, start, len; 316 struct kwsmatch kwsm; 317 size_t i; 318#ifdef MBS_SUPPORT 319 char *mb_properties = NULL; 320#endif /* MBS_SUPPORT */ 321 322#ifdef MBS_SUPPORT 323 if (MB_CUR_MAX > 1 && cregex->ckwset.kwset) 324 mb_properties = check_multibyte_string (buf, buf_size); 325#endif /* MBS_SUPPORT */ 326 327 buflim = buf + buf_size; 328 329 for (beg = end = buf; end < buflim; beg = end) 330 { 331 if (!exact) 332 { 333 if (cregex->ckwset.kwset) 334 { 335 /* Find a possible match using the KWset matcher. */ 336 size_t offset = kwsexec (cregex->ckwset.kwset, beg, buflim - beg, &kwsm); 337 if (offset == (size_t) -1) 338 { 339#ifdef MBS_SUPPORT 340 if (MB_CUR_MAX > 1) 341 free (mb_properties); 342#endif 343 return (size_t)-1; 344 } 345 beg += offset; 346 /* Narrow down to the line containing the candidate, and 347 run it through DFA. */ 348 end = memchr (beg, eol, buflim - beg); 349 if (end != NULL) 350 end++; 351 else 352 end = buflim; 353#ifdef MBS_SUPPORT 354 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) 355 continue; 356#endif 357 while (beg > buf && beg[-1] != eol) 358 --beg; 359 if (kwsm.index < cregex->kwset_exact_matches) 360 goto success; 361 if (dfaexec (&cregex->dfa, beg, end - beg, &backref) == (size_t) -1) 362 continue; 363 } 364 else 365 { 366 /* No good fixed strings; start with DFA. */ 367 size_t offset = dfaexec (&cregex->dfa, beg, buflim - beg, &backref); 368 if (offset == (size_t) -1) 369 break; 370 /* Narrow down to the line we've found. */ 371 beg += offset; 372 end = memchr (beg, eol, buflim - beg); 373 if (end != NULL) 374 end++; 375 else 376 end = buflim; 377 while (beg > buf && beg[-1] != eol) 378 --beg; 379 } 380 /* Successful, no backreferences encountered! */ 381 if (!backref) 382 goto success; 383 } 384 else 385 end = beg + buf_size; 386 387 /* If we've made it to this point, this means DFA has seen 388 a probable match, and we need to run it through Regex. */ 389 for (i = 0; i < cregex->pcount; i++) 390 { 391 cregex->patterns[i].regexbuf.not_eol = 0; 392 if (0 <= (start = re_search (&(cregex->patterns[i].regexbuf), beg, 393 end - beg - 1, 0, 394 end - beg - 1, &(cregex->patterns[i].regs)))) 395 { 396 len = cregex->patterns[i].regs.end[0] - start; 397 if (exact) 398 { 399 *match_size = len; 400 return start; 401 } 402 if ((!cregex->match_lines && !cregex->match_words) 403 || (cregex->match_lines && len == end - beg - 1)) 404 goto success; 405 /* If -w, check if the match aligns with word boundaries. 406 We do this iteratively because: 407 (a) the line may contain more than one occurence of the 408 pattern, and 409 (b) Several alternatives in the pattern might be valid at a 410 given point, and we may need to consider a shorter one to 411 find a word boundary. */ 412 if (cregex->match_words) 413 while (start >= 0) 414 { 415 if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1])) 416 && (len == end - beg - 1 417 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len]))) 418 goto success; 419 if (len > 0) 420 { 421 /* Try a shorter length anchored at the same place. */ 422 --len; 423 cregex->patterns[i].regexbuf.not_eol = 1; 424 len = re_match (&(cregex->patterns[i].regexbuf), beg, 425 start + len, start, 426 &(cregex->patterns[i].regs)); 427 } 428 if (len <= 0) 429 { 430 /* Try looking further on. */ 431 if (start == end - beg - 1) 432 break; 433 ++start; 434 cregex->patterns[i].regexbuf.not_eol = 0; 435 start = re_search (&(cregex->patterns[i].regexbuf), beg, 436 end - beg - 1, 437 start, end - beg - 1 - start, 438 &(cregex->patterns[i].regs)); 439 len = cregex->patterns[i].regs.end[0] - start; 440 } 441 } 442 } 443 } /* for Regex patterns. */ 444 } /* for (beg = end ..) */ 445#ifdef MBS_SUPPORT 446 if (MB_CUR_MAX > 1 && mb_properties) 447 free (mb_properties); 448#endif /* MBS_SUPPORT */ 449 return (size_t) -1; 450 451 success: 452#ifdef MBS_SUPPORT 453 if (MB_CUR_MAX > 1 && mb_properties) 454 free (mb_properties); 455#endif /* MBS_SUPPORT */ 456 *match_size = end - beg; 457 return beg - buf; 458} 459 460static void 461EGfree (void *compiled_pattern) 462{ 463 struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern; 464 465 dfafree (&cregex->dfa); 466 free (cregex->patterns); 467 free (cregex->ckwset.trans); 468 free (cregex); 469} 470 471/* POSIX Basic Regular Expressions */ 472matcher_t matcher_grep = 473 { 474 Gcompile, 475 EGexecute, 476 EGfree 477 }; 478 479/* POSIX Extended Regular Expressions */ 480matcher_t matcher_egrep = 481 { 482 Ecompile, 483 EGexecute, 484 EGfree 485 }; 486 487/* AWK Regular Expressions */ 488matcher_t matcher_awk = 489 { 490 AWKcompile, 491 EGexecute, 492 EGfree 493 }; 494