1/* $NetBSD$ */ 2 3/* search.c - searching subroutines using dfa, kwset and regex for grep. 4 Copyright 1992, 1998, 2000 Free Software Foundation, Inc. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program; if not, write to the Free Software 18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 19 02111-1307, USA. */ 20 21/* Written August 1992 by Mike Haertel. */ 22 23#ifdef HAVE_CONFIG_H 24# include <config.h> 25#endif 26#include <sys/types.h> 27#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC && defined HAVE_WCTYPE 28/* We can handle multibyte string. */ 29# define MBS_SUPPORT 30# include <wchar.h> 31# include <wctype.h> 32#endif 33 34#include "system.h" 35#include "grep.h" 36#include "regex.h" 37#include "dfa.h" 38#include "kwset.h" 39#include "error.h" 40#include "xalloc.h" 41#ifdef HAVE_LIBPCRE 42# include <pcre.h> 43#endif 44 45#define NCHAR (UCHAR_MAX + 1) 46 47/* For -w, we also consider _ to be word constituent. */ 48#define WCHAR(C) (ISALNUM(C) || (C) == '_') 49 50/* DFA compiled regexp. */ 51static struct dfa dfa; 52 53/* The Regex compiled patterns. */ 54static struct patterns 55{ 56 /* Regex compiled regexp. */ 57 struct re_pattern_buffer regexbuf; 58 struct re_registers regs; /* This is here on account of a BRAIN-DEAD 59 Q@#%!# library interface in regex.c. */ 60} patterns0; 61 62struct patterns *patterns; 63size_t pcount; 64 65/* KWset compiled pattern. For Ecompile and Gcompile, we compile 66 a list of strings, at least one of which is known to occur in 67 any string matching the regexp. */ 68static kwset_t kwset; 69 70/* Number of compiled fixed strings known to exactly match the regexp. 71 If kwsexec returns < kwset_exact_matches, then we don't need to 72 call the regexp matcher at all. */ 73static int kwset_exact_matches; 74 75#if defined(MBS_SUPPORT) 76static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); 77#endif 78static void kwsinit PARAMS ((void)); 79static void kwsmusts PARAMS ((void)); 80static void Gcompile PARAMS ((char const *, size_t)); 81static void Ecompile PARAMS ((char const *, size_t)); 82static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int )); 83static void Fcompile PARAMS ((char const *, size_t)); 84static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int)); 85static void Pcompile PARAMS ((char const *, size_t )); 86static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); 87 88void 89dfaerror (char const *mesg) 90{ 91 error (2, 0, mesg); 92} 93 94static void 95kwsinit (void) 96{ 97 static char trans[NCHAR]; 98 int i; 99 100 if (match_icase) 101 for (i = 0; i < NCHAR; ++i) 102 trans[i] = TOLOWER (i); 103 104 if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0))) 105 error (2, 0, _("memory exhausted")); 106} 107 108/* If the DFA turns out to have some set of fixed strings one of 109 which must occur in the match, then we build a kwset matcher 110 to find those strings, and thus quickly filter out impossible 111 matches. */ 112static void 113kwsmusts (void) 114{ 115 struct dfamust const *dm; 116 char const *err; 117 118 if (dfa.musts) 119 { 120 kwsinit (); 121 /* First, we compile in the substrings known to be exact 122 matches. The kwset matcher will return the index 123 of the matching string that it chooses. */ 124 for (dm = dfa.musts; dm; dm = dm->next) 125 { 126 if (!dm->exact) 127 continue; 128 ++kwset_exact_matches; 129 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) 130 error (2, 0, err); 131 } 132 /* Now, we compile the substrings that will require 133 the use of the regexp matcher. */ 134 for (dm = dfa.musts; dm; dm = dm->next) 135 { 136 if (dm->exact) 137 continue; 138 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) 139 error (2, 0, err); 140 } 141 if ((err = kwsprep (kwset)) != 0) 142 error (2, 0, err); 143 } 144} 145 146#ifdef MBS_SUPPORT 147/* This function allocate the array which correspond to "buf". 148 Then this check multibyte string and mark on the positions which 149 are not singlebyte character nor the first byte of a multibyte 150 character. Caller must free the array. */ 151static char* 152check_multibyte_string(char const *buf, size_t size) 153{ 154 char *mb_properties = malloc(size); 155 mbstate_t cur_state; 156 size_t i; 157 memset(&cur_state, 0, sizeof(mbstate_t)); 158 memset(mb_properties, 0, sizeof(char)*size); 159 for (i = 0; i < size ;) 160 { 161 size_t mbclen; 162 mbclen = mbrlen(buf + i, size - i, &cur_state); 163 164 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) 165 { 166 /* An invalid sequence, or a truncated multibyte character. 167 We treat it as a singlebyte character. */ 168 mbclen = 1; 169 } 170 mb_properties[i] = mbclen; 171 i += mbclen; 172 } 173 174 return mb_properties; 175} 176#endif 177 178static void 179Gcompile (char const *pattern, size_t size) 180{ 181 const char *err; 182 char const *sep; 183 size_t total = size; 184 char const *motif = pattern; 185 186 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); 187 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); 188 189 /* For GNU regex compiler we have to pass the patterns separately to detect 190 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 191 GNU regex should have raise a syntax error. The same for backref, where 192 the backref should have been local to each pattern. */ 193 do 194 { 195 size_t len; 196 sep = memchr (motif, '\n', total); 197 if (sep) 198 { 199 len = sep - motif; 200 sep++; 201 total -= (len + 1); 202 } 203 else 204 { 205 len = total; 206 total = 0; 207 } 208 209 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); 210 if (patterns == NULL) 211 error (2, errno, _("memory exhausted")); 212 213 patterns[pcount] = patterns0; 214 215 if ((err = re_compile_pattern (motif, len, 216 &(patterns[pcount].regexbuf))) != 0) 217 error (2, 0, err); 218 pcount++; 219 220 motif = sep; 221 } while (sep && total != 0); 222 223 /* In the match_words and match_lines cases, we use a different pattern 224 for the DFA matcher that will quickly throw out cases that won't work. 225 Then if DFA succeeds we do some hairy stuff using the regex matcher 226 to decide whether the match should really count. */ 227 if (match_words || match_lines) 228 { 229 /* In the whole-word case, we use the pattern: 230 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\). 231 In the whole-line case, we use the pattern: 232 ^\(userpattern\)$. */ 233 234 static char const line_beg[] = "^\\("; 235 static char const line_end[] = "\\)$"; 236 static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; 237 static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; 238 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); 239 size_t i; 240 strcpy (n, match_lines ? line_beg : word_beg); 241 i = strlen (n); 242 memcpy (n + i, pattern, size); 243 i += size; 244 strcpy (n + i, match_lines ? line_end : word_end); 245 i += strlen (n + i); 246 pattern = n; 247 size = i; 248 } 249 250 dfacomp (pattern, size, &dfa, 1); 251 kwsmusts (); 252} 253 254static void 255Ecompile (char const *pattern, size_t size) 256{ 257 const char *err; 258 const char *sep; 259 size_t total = size; 260 char const *motif = pattern; 261 262 if (strcmp (matcher, "awk") == 0) 263 { 264 re_set_syntax (RE_SYNTAX_AWK); 265 dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); 266 } 267 else 268 { 269 re_set_syntax (RE_SYNTAX_POSIX_EGREP); 270 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); 271 } 272 273 /* For GNU regex compiler we have to pass the patterns separately to detect 274 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" 275 GNU regex should have raise a syntax error. The same for backref, where 276 the backref should have been local to each pattern. */ 277 do 278 { 279 size_t len; 280 sep = memchr (motif, '\n', total); 281 if (sep) 282 { 283 len = sep - motif; 284 sep++; 285 total -= (len + 1); 286 } 287 else 288 { 289 len = total; 290 total = 0; 291 } 292 293 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); 294 if (patterns == NULL) 295 error (2, errno, _("memory exhausted")); 296 patterns[pcount] = patterns0; 297 298 if ((err = re_compile_pattern (motif, len, 299 &(patterns[pcount].regexbuf))) != 0) 300 error (2, 0, err); 301 pcount++; 302 303 motif = sep; 304 } while (sep && total != 0); 305 306 /* In the match_words and match_lines cases, we use a different pattern 307 for the DFA matcher that will quickly throw out cases that won't work. 308 Then if DFA succeeds we do some hairy stuff using the regex matcher 309 to decide whether the match should really count. */ 310 if (match_words || match_lines) 311 { 312 /* In the whole-word case, we use the pattern: 313 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$). 314 In the whole-line case, we use the pattern: 315 ^(userpattern)$. */ 316 317 static char const line_beg[] = "^("; 318 static char const line_end[] = ")$"; 319 static char const word_beg[] = "(^|[^[:alnum:]_])("; 320 static char const word_end[] = ")([^[:alnum:]_]|$)"; 321 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); 322 size_t i; 323 strcpy (n, match_lines ? line_beg : word_beg); 324 i = strlen(n); 325 memcpy (n + i, pattern, size); 326 i += size; 327 strcpy (n + i, match_lines ? line_end : word_end); 328 i += strlen (n + i); 329 pattern = n; 330 size = i; 331 } 332 333 dfacomp (pattern, size, &dfa, 1); 334 kwsmusts (); 335} 336 337static size_t 338EGexecute (char const *buf, size_t size, size_t *match_size, int exact) 339{ 340 register char const *buflim, *beg, *end; 341 char eol = eolbyte; 342 int backref; 343 ptrdiff_t start, len; 344 struct kwsmatch kwsm; 345 size_t i; 346#ifdef MBS_SUPPORT 347 char *mb_properties = NULL; 348#endif /* MBS_SUPPORT */ 349 350#ifdef MBS_SUPPORT 351 if (MB_CUR_MAX > 1 && kwset) 352 mb_properties = check_multibyte_string(buf, size); 353#endif /* MBS_SUPPORT */ 354 355 buflim = buf + size; 356 357 for (beg = end = buf; end < buflim; beg = end) 358 { 359 if (!exact) 360 { 361 if (kwset) 362 { 363 /* Find a possible match using the KWset matcher. */ 364 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); 365 if (offset == (size_t) -1) 366 goto failure; 367 beg += offset; 368 /* Narrow down to the line containing the candidate, and 369 run it through DFA. */ 370 end = memchr(beg, eol, buflim - beg); 371 end++; 372#ifdef MBS_SUPPORT 373 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) 374 continue; 375#endif 376 while (beg > buf && beg[-1] != eol) 377 --beg; 378 if (kwsm.index < kwset_exact_matches) 379 goto success_in_beg_and_end; 380 if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) 381 continue; 382 } 383 else 384 { 385 /* No good fixed strings; start with DFA. */ 386 size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); 387 if (offset == (size_t) -1) 388 break; 389 /* Narrow down to the line we've found. */ 390 beg += offset; 391 end = memchr (beg, eol, buflim - beg); 392 end++; 393 while (beg > buf && beg[-1] != eol) 394 --beg; 395 } 396 /* Successful, no backreferences encountered! */ 397 if (!backref) 398 goto success_in_beg_and_end; 399 } 400 else 401 end = beg + size; 402 403 /* If we've made it to this point, this means DFA has seen 404 a probable match, and we need to run it through Regex. */ 405 for (i = 0; i < pcount; i++) 406 { 407 patterns[i].regexbuf.not_eol = 0; 408 if (0 <= (start = re_search (&(patterns[i].regexbuf), beg, 409 end - beg - 1, 0, 410 end - beg - 1, &(patterns[i].regs)))) 411 { 412 len = patterns[i].regs.end[0] - start; 413 if (exact && !match_words) 414 goto success_in_start_and_len; 415 if ((!match_lines && !match_words) 416 || (match_lines && len == end - beg - 1)) 417 goto success_in_beg_and_end; 418 /* If -w, check if the match aligns with word boundaries. 419 We do this iteratively because: 420 (a) the line may contain more than one occurence of the 421 pattern, and 422 (b) Several alternatives in the pattern might be valid at a 423 given point, and we may need to consider a shorter one to 424 find a word boundary. */ 425 if (match_words) 426 while (start >= 0) 427 { 428 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) 429 && (len == end - beg - 1 430 || !WCHAR ((unsigned char) beg[start + len]))) 431 goto success_in_start_and_len; 432 if (len > 0) 433 { 434 /* Try a shorter length anchored at the same place. */ 435 --len; 436 patterns[i].regexbuf.not_eol = 1; 437 len = re_match (&(patterns[i].regexbuf), beg, 438 start + len, start, 439 &(patterns[i].regs)); 440 } 441 if (len <= 0) 442 { 443 /* Try looking further on. */ 444 if (start == end - beg - 1) 445 break; 446 ++start; 447 patterns[i].regexbuf.not_eol = 0; 448 start = re_search (&(patterns[i].regexbuf), beg, 449 end - beg - 1, 450 start, end - beg - 1 - start, 451 &(patterns[i].regs)); 452 len = patterns[i].regs.end[0] - start; 453 } 454 } 455 } 456 } /* for Regex patterns. */ 457 } /* for (beg = end ..) */ 458 459 failure: 460#ifdef MBS_SUPPORT 461 if (MB_CUR_MAX > 1 && mb_properties) 462 free (mb_properties); 463#endif /* MBS_SUPPORT */ 464 return (size_t) -1; 465 466 success_in_beg_and_end: 467 len = end - beg; 468 start = beg - buf; 469 /* FALLTHROUGH */ 470 471 success_in_start_and_len: 472#ifdef MBS_SUPPORT 473 if (MB_CUR_MAX > 1 && mb_properties) 474 free (mb_properties); 475#endif /* MBS_SUPPORT */ 476 *match_size = len; 477 return start; 478} 479 480static void 481Fcompile (char const *pattern, size_t size) 482{ 483 char const *beg, *lim, *err; 484 485 kwsinit (); 486 beg = pattern; 487 do 488 { 489 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim) 490 ; 491 if ((err = kwsincr (kwset, beg, lim - beg)) != 0) 492 error (2, 0, err); 493 if (lim < pattern + size) 494 ++lim; 495 beg = lim; 496 } 497 while (beg < pattern + size); 498 499 if ((err = kwsprep (kwset)) != 0) 500 error (2, 0, err); 501} 502 503static size_t 504Fexecute (char const *buf, size_t size, size_t *match_size, int exact) 505{ 506 register char const *beg, *try, *end; 507 register size_t len; 508 char eol = eolbyte; 509 struct kwsmatch kwsmatch; 510#ifdef MBS_SUPPORT 511 char *mb_properties; 512 if (MB_CUR_MAX > 1) 513 mb_properties = check_multibyte_string (buf, size); 514#endif /* MBS_SUPPORT */ 515 516 for (beg = buf; beg <= buf + size; ++beg) 517 { 518 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); 519 if (offset == (size_t) -1) 520 goto failure; 521#ifdef MBS_SUPPORT 522 if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) 523 continue; /* It is a part of multibyte character. */ 524#endif /* MBS_SUPPORT */ 525 beg += offset; 526 len = kwsmatch.size[0]; 527 if (exact && !match_words) 528 goto success_in_beg_and_len; 529 if (match_lines) 530 { 531 if (beg > buf && beg[-1] != eol) 532 continue; 533 if (beg + len < buf + size && beg[len] != eol) 534 continue; 535 goto success; 536 } 537 else if (match_words) 538 { 539 while (offset >= 0) 540 { 541 if ((offset == 0 || !WCHAR ((unsigned char) beg[-1])) 542 && (len == end - beg - 1 || !WCHAR ((unsigned char) beg[len]))) 543 { 544 if (!exact) 545 /* Returns the whole line now we know there's a word match. */ 546 goto success; 547 else 548 /* Returns just this word match. */ 549 goto success_in_beg_and_len; 550 } 551 if (len > 0) 552 { 553 /* Try a shorter length anchored at the same place. */ 554 --len; 555 offset = kwsexec (kwset, beg, len, &kwsmatch); 556 if (offset == -1) { 557 break; /* Try a different anchor. */ 558 } 559 beg += offset; 560 len = kwsmatch.size[0]; 561 } 562 } 563 } 564 else 565 goto success; 566 } 567 568 failure: 569#ifdef MBS_SUPPORT 570 if (MB_CUR_MAX > 1) 571 free (mb_properties); 572#endif /* MBS_SUPPORT */ 573 return -1; 574 575 success: 576 end = memchr (beg + len, eol, (buf + size) - (beg + len)); 577 end++; 578 while (buf < beg && beg[-1] != eol) 579 --beg; 580 len = end - beg; 581 /* FALLTHROUGH */ 582 583 success_in_beg_and_len: 584 *match_size = len; 585#ifdef MBS_SUPPORT 586 if (MB_CUR_MAX > 1) 587 free (mb_properties); 588#endif /* MBS_SUPPORT */ 589 return beg - buf; 590} 591 592#if HAVE_LIBPCRE 593/* Compiled internal form of a Perl regular expression. */ 594static pcre *cre; 595 596/* Additional information about the pattern. */ 597static pcre_extra *extra; 598#endif 599 600static void 601Pcompile (char const *pattern, size_t size) 602{ 603#if !HAVE_LIBPCRE 604 error (2, 0, _("The -P option is not supported")); 605#else 606 int e; 607 char const *ep; 608 char *re = xmalloc (4 * size + 7); 609 int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0); 610 char const *patlim = pattern + size; 611 char *n = re; 612 char const *p; 613 char const *pnul; 614 615 /* FIXME: Remove this restriction. */ 616 if (eolbyte != '\n') 617 error (2, 0, _("The -P and -z options cannot be combined")); 618 619 *n = '\0'; 620 if (match_lines) 621 strcpy (n, "^("); 622 if (match_words) 623 strcpy (n, "\\b("); 624 n += strlen (n); 625 626 /* The PCRE interface doesn't allow NUL bytes in the pattern, so 627 replace each NUL byte in the pattern with the four characters 628 "\000", removing a preceding backslash if there are an odd 629 number of backslashes before the NUL. 630 631 FIXME: This method does not work with some multibyte character 632 encodings, notably Shift-JIS, where a multibyte character can end 633 in a backslash byte. */ 634 for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1) 635 { 636 memcpy (n, p, pnul - p); 637 n += pnul - p; 638 for (p = pnul; pattern < p && p[-1] == '\\'; p--) 639 continue; 640 n -= (pnul - p) & 1; 641 strcpy (n, "\\000"); 642 n += 4; 643 } 644 645 memcpy (n, p, patlim - p); 646 n += patlim - p; 647 *n = '\0'; 648 if (match_words) 649 strcpy (n, ")\\b"); 650 if (match_lines) 651 strcpy (n, ")$"); 652 653 cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); 654 if (!cre) 655 error (2, 0, ep); 656 657 extra = pcre_study (cre, 0, &ep); 658 if (ep) 659 error (2, 0, ep); 660 661 free (re); 662#endif 663} 664 665static size_t 666Pexecute (char const *buf, size_t size, size_t *match_size, int exact) 667{ 668#if !HAVE_LIBPCRE 669 abort (); 670 return -1; 671#else 672 /* This array must have at least two elements; everything after that 673 is just for performance improvement in pcre_exec. */ 674 int sub[300]; 675 676 int e = pcre_exec (cre, extra, buf, size, 0, 0, 677 sub, sizeof sub / sizeof *sub); 678 679 if (e <= 0) 680 { 681 switch (e) 682 { 683 case PCRE_ERROR_NOMATCH: 684 return -1; 685 686 case PCRE_ERROR_NOMEMORY: 687 error (2, 0, _("Memory exhausted")); 688 689 default: 690 abort (); 691 } 692 } 693 else 694 { 695 /* Narrow down to the line we've found. */ 696 char const *beg = buf + sub[0]; 697 char const *end = buf + sub[1]; 698 char const *buflim = buf + size; 699 char eol = eolbyte; 700 if (!exact) 701 { 702 end = memchr (end, eol, buflim - end); 703 end++; 704 while (buf < beg && beg[-1] != eol) 705 --beg; 706 } 707 708 *match_size = end - beg; 709 return beg - buf; 710 } 711#endif 712} 713 714struct matcher const matchers[] = { 715 { "default", Gcompile, EGexecute }, 716 { "grep", Gcompile, EGexecute }, 717 { "egrep", Ecompile, EGexecute }, 718 { "awk", Ecompile, EGexecute }, 719 { "fgrep", Fcompile, Fexecute }, 720 { "perl", Pcompile, Pexecute }, 721 { "", 0, 0 }, 722}; 723