Deleted Added
sdiff udiff text old ( 126435 ) new ( 131557 )
full compact
1/* search.c - searching subroutines using dfa, kwset and regex for grep.
2 Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8

--- 4 unchanged lines hidden (view full) ---

13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
17 02111-1307, USA. */
18
19/* Written August 1992 by Mike Haertel. */
20
21/* $FreeBSD: head/gnu/usr.bin/grep/search.c 131557 2004-07-04 10:02:03Z tjr $ */
22
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26#include <sys/types.h>
27#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
28/* We can handle multibyte string. */
29# define MBS_SUPPORT
30# include <wchar.h>
31# include <wctype.h>
32#endif
33
34#include "system.h"
35#include "grep.h"
36#include "regex.h"
37#include "dfa.h"
38#include "kwset.h"
39#include "error.h"
40#include "xalloc.h"
41#ifdef HAVE_LIBPCRE
42# include <pcre.h>
43#endif
44
45#define NCHAR (UCHAR_MAX + 1)
46
47/* For -w, we also consider _ to be word constituent. */
48#define WCHAR(C) (ISALNUM(C) || (C) == '_')
49
50/* DFA compiled regexp. */
51static struct dfa dfa;
52
53/* The Regex compiled patterns. */
54static struct patterns
55{
56 /* Regex compiled regexp. */
57 struct re_pattern_buffer regexbuf;
58 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
59 Q@#%!# library interface in regex.c. */
60} patterns0;
61
62struct patterns *patterns;
63size_t pcount;
64
65/* KWset compiled pattern. For Ecompile and Gcompile, we compile
66 a list of strings, at least one of which is known to occur in
67 any string matching the regexp. */
68static kwset_t kwset;
69
70/* Number of compiled fixed strings known to exactly match the regexp.
71 If kwsexec returns < kwset_exact_matches, then we don't need to
72 call the regexp matcher at all. */
73static int kwset_exact_matches;
74
75#if defined(MBS_SUPPORT)
76static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
77#endif
78static void kwsinit PARAMS ((void));
79static void kwsmusts PARAMS ((void));
80static void Gcompile PARAMS ((char const *, size_t));
81static void Ecompile PARAMS ((char const *, size_t));
82static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
83static void Fcompile PARAMS ((char const *, size_t));
84static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
85static void Pcompile PARAMS ((char const *, size_t ));
86static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
87
88void
89dfaerror (char const *mesg)
90{
91 error (2, 0, mesg);
92}
93
94static void
95kwsinit (void)
96{
97 static char trans[NCHAR];
98 int i;
99
100 if (match_icase)
101 for (i = 0; i < NCHAR; ++i)
102 trans[i] = TOLOWER (i);
103
104 if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
105 error (2, 0, _("memory exhausted"));
106}
107
108/* If the DFA turns out to have some set of fixed strings one of
109 which must occur in the match, then we build a kwset matcher
110 to find those strings, and thus quickly filter out impossible
111 matches. */
112static void
113kwsmusts (void)
114{
115 struct dfamust const *dm;
116 char const *err;
117
118 if (dfa.musts)
119 {
120 kwsinit ();
121 /* First, we compile in the substrings known to be exact
122 matches. The kwset matcher will return the index
123 of the matching string that it chooses. */
124 for (dm = dfa.musts; dm; dm = dm->next)
125 {
126 if (!dm->exact)
127 continue;
128 ++kwset_exact_matches;
129 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
130 error (2, 0, err);
131 }
132 /* Now, we compile the substrings that will require
133 the use of the regexp matcher. */
134 for (dm = dfa.musts; dm; dm = dm->next)
135 {
136 if (dm->exact)
137 continue;
138 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
139 error (2, 0, err);
140 }
141 if ((err = kwsprep (kwset)) != 0)
142 error (2, 0, err);
143 }
144}
145
146#ifdef MBS_SUPPORT
147/* This function allocate the array which correspond to "buf".
148 Then this check multibyte string and mark on the positions which
149 are not singlebyte character nor the first byte of a multibyte
150 character. Caller must free the array. */
151static char*
152check_multibyte_string(char const *buf, size_t size)
153{
154 char *mb_properties = malloc(size);
155 mbstate_t cur_state;
156 int i;
157 memset(&cur_state, 0, sizeof(mbstate_t));
158 memset(mb_properties, 0, sizeof(char)*size);
159 for (i = 0; i < size ;)
160 {
161 size_t mbclen;
162 mbclen = mbrlen(buf + i, size - i, &cur_state);
163
164 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
165 {
166 /* An invalid sequence, or a truncated multibyte character.
167 We treat it as a singlebyte character. */
168 mbclen = 1;
169 }
170 mb_properties[i] = mbclen;
171 i += mbclen;
172 }
173
174 return mb_properties;
175}
176#endif
177
178static void
179Gcompile (char const *pattern, size_t size)
180{
181 const char *err;
182 char const *sep;
183 size_t total = size;
184 char const *motif = pattern;
185
186 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
187 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
188
189 /* For GNU regex compiler we have to pass the patterns separately to detect
190 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
191 GNU regex should have raise a syntax error. The same for backref, where
192 the backref should have been local to each pattern. */
193 do
194 {
195 size_t len;
196 sep = memchr (motif, '\n', total);
197 if (sep)
198 {
199 len = sep - motif;
200 sep++;
201 total -= (len + 1);
202 }
203 else
204 {
205 len = total;
206 total = 0;
207 }
208
209 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
210 if (patterns == NULL)
211 error (2, errno, _("memory exhausted"));
212
213 patterns[pcount] = patterns0;
214
215 if ((err = re_compile_pattern (motif, len,
216 &(patterns[pcount].regexbuf))) != 0)
217 error (2, 0, err);
218 pcount++;
219
220 motif = sep;
221 } while (sep && total != 0);
222
223 /* In the match_words and match_lines cases, we use a different pattern
224 for the DFA matcher that will quickly throw out cases that won't work.
225 Then if DFA succeeds we do some hairy stuff using the regex matcher
226 to decide whether the match should really count. */
227 if (match_words || match_lines)
228 {
229 /* In the whole-word case, we use the pattern:
230 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
231 In the whole-line case, we use the pattern:
232 ^\(userpattern\)$. */
233
234 static char const line_beg[] = "^\\(";
235 static char const line_end[] = "\\)$";
236 static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
237 static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
238 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
239 size_t i;
240 strcpy (n, match_lines ? line_beg : word_beg);
241 i = strlen (n);
242 memcpy (n + i, pattern, size);
243 i += size;
244 strcpy (n + i, match_lines ? line_end : word_end);
245 i += strlen (n + i);
246 pattern = n;
247 size = i;
248 }
249
250 dfacomp (pattern, size, &dfa, 1);
251 kwsmusts ();
252}
253
254static void
255Ecompile (char const *pattern, size_t size)
256{
257 const char *err;
258 const char *sep;
259 size_t total = size;
260 char const *motif = pattern;
261
262 if (strcmp (matcher, "awk") == 0)
263 {
264 re_set_syntax (RE_SYNTAX_AWK);
265 dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
266 }
267 else
268 {
269 re_set_syntax (RE_SYNTAX_POSIX_EGREP);
270 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
271 }
272
273 /* For GNU regex compiler we have to pass the patterns separately to detect
274 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
275 GNU regex should have raise a syntax error. The same for backref, where
276 the backref should have been local to each pattern. */
277 do
278 {
279 size_t len;
280 sep = memchr (motif, '\n', total);
281 if (sep)
282 {
283 len = sep - motif;
284 sep++;
285 total -= (len + 1);
286 }
287 else
288 {
289 len = total;
290 total = 0;
291 }
292
293 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
294 if (patterns == NULL)
295 error (2, errno, _("memory exhausted"));
296 patterns[pcount] = patterns0;
297
298 if ((err = re_compile_pattern (motif, len,
299 &(patterns[pcount].regexbuf))) != 0)
300 error (2, 0, err);
301 pcount++;
302
303 motif = sep;
304 } while (sep && total != 0);
305
306 /* In the match_words and match_lines cases, we use a different pattern
307 for the DFA matcher that will quickly throw out cases that won't work.
308 Then if DFA succeeds we do some hairy stuff using the regex matcher
309 to decide whether the match should really count. */
310 if (match_words || match_lines)
311 {
312 /* In the whole-word case, we use the pattern:
313 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
314 In the whole-line case, we use the pattern:
315 ^(userpattern)$. */
316
317 static char const line_beg[] = "^(";
318 static char const line_end[] = ")$";
319 static char const word_beg[] = "(^|[^[:alnum:]_])(";
320 static char const word_end[] = ")([^[:alnum:]_]|$)";
321 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
322 size_t i;
323 strcpy (n, match_lines ? line_beg : word_beg);
324 i = strlen(n);
325 memcpy (n + i, pattern, size);
326 i += size;
327 strcpy (n + i, match_lines ? line_end : word_end);
328 i += strlen (n + i);
329 pattern = n;
330 size = i;
331 }
332
333 dfacomp (pattern, size, &dfa, 1);
334 kwsmusts ();
335}
336
337static size_t
338EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
339{
340 register char const *buflim, *beg, *end;
341 char eol = eolbyte;
342 int backref, start, len;
343 struct kwsmatch kwsm;
344 size_t i;
345#ifdef MBS_SUPPORT
346 char *mb_properties = NULL;
347#endif /* MBS_SUPPORT */
348
349#ifdef MBS_SUPPORT
350 if (MB_CUR_MAX > 1 && kwset)
351 mb_properties = check_multibyte_string(buf, size);
352#endif /* MBS_SUPPORT */
353
354 buflim = buf + size;
355
356 for (beg = end = buf; end < buflim; beg = end)
357 {
358 if (!exact)
359 {
360 if (kwset)
361 {
362 /* Find a possible match using the KWset matcher. */
363 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
364 if (offset == (size_t) -1)
365 {
366#ifdef MBS_SUPPORT
367 if (MB_CUR_MAX > 1)
368 free(mb_properties);
369#endif
370 return (size_t)-1;
371 }
372 beg += offset;
373 /* Narrow down to the line containing the candidate, and
374 run it through DFA. */
375 end = memchr(beg, eol, buflim - beg);
376 end++;
377#ifdef MBS_SUPPORT
378 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
379 continue;
380#endif
381 while (beg > buf && beg[-1] != eol)
382 --beg;
383 if (kwsm.index < kwset_exact_matches)
384 goto success;
385 if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
386 continue;
387 }
388 else
389 {
390 /* No good fixed strings; start with DFA. */
391 size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
392 if (offset == (size_t) -1)
393 break;
394 /* Narrow down to the line we've found. */
395 beg += offset;
396 end = memchr (beg, eol, buflim - beg);
397 end++;
398 while (beg > buf && beg[-1] != eol)
399 --beg;
400 }
401 /* Successful, no backreferences encountered! */
402 if (!backref)
403 goto success;
404 }
405 else
406 end = beg + size;
407
408 /* If we've made it to this point, this means DFA has seen
409 a probable match, and we need to run it through Regex. */
410 for (i = 0; i < pcount; i++)
411 {
412 patterns[i].regexbuf.not_eol = 0;
413 if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
414 end - beg - 1, 0,
415 end - beg - 1, &(patterns[i].regs))))
416 {
417 len = patterns[i].regs.end[0] - start;
418 if (exact)
419 {
420 *match_size = len;
421 return start;
422 }
423 if ((!match_lines && !match_words)
424 || (match_lines && len == end - beg - 1))
425 goto success;
426 /* If -w, check if the match aligns with word boundaries.
427 We do this iteratively because:
428 (a) the line may contain more than one occurence of the
429 pattern, and
430 (b) Several alternatives in the pattern might be valid at a
431 given point, and we may need to consider a shorter one to
432 find a word boundary. */
433 if (match_words)
434 while (start >= 0)
435 {
436 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
437 && (len == end - beg - 1
438 || !WCHAR ((unsigned char) beg[start + len])))
439 goto success;
440 if (len > 0)
441 {
442 /* Try a shorter length anchored at the same place. */
443 --len;
444 patterns[i].regexbuf.not_eol = 1;
445 len = re_match (&(patterns[i].regexbuf), beg,
446 start + len, start,
447 &(patterns[i].regs));
448 }
449 if (len <= 0)
450 {
451 /* Try looking further on. */
452 if (start == end - beg - 1)
453 break;
454 ++start;
455 patterns[i].regexbuf.not_eol = 0;
456 start = re_search (&(patterns[i].regexbuf), beg,
457 end - beg - 1,
458 start, end - beg - 1 - start,
459 &(patterns[i].regs));
460 len = patterns[i].regs.end[0] - start;
461 }
462 }
463 }
464 } /* for Regex patterns. */
465 } /* for (beg = end ..) */
466#ifdef MBS_SUPPORT
467 if (MB_CUR_MAX > 1 && mb_properties)
468 free (mb_properties);
469#endif /* MBS_SUPPORT */
470 return (size_t) -1;
471
472 success:
473#ifdef MBS_SUPPORT
474 if (MB_CUR_MAX > 1 && mb_properties)
475 free (mb_properties);
476#endif /* MBS_SUPPORT */
477 *match_size = end - beg;
478 return beg - buf;
479}
480
481static void
482Fcompile (char const *pattern, size_t size)
483{
484 char const *beg, *lim, *err;
485
486 kwsinit ();
487 beg = pattern;
488 do
489 {
490 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
491 ;
492 if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
493 error (2, 0, err);
494 if (lim < pattern + size)
495 ++lim;
496 beg = lim;
497 }
498 while (beg < pattern + size);
499
500 if ((err = kwsprep (kwset)) != 0)
501 error (2, 0, err);
502}
503
504static size_t
505Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
506{
507 register char const *beg, *try, *end;
508 register size_t len;
509 char eol = eolbyte;
510 struct kwsmatch kwsmatch;
511#ifdef MBS_SUPPORT
512 char *mb_properties;
513 if (MB_CUR_MAX > 1)
514 mb_properties = check_multibyte_string (buf, size);
515#endif /* MBS_SUPPORT */
516
517 for (beg = buf; beg <= buf + size; ++beg)
518 {
519 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
520 if (offset == (size_t) -1)
521 {
522#ifdef MBS_SUPPORT
523 if (MB_CUR_MAX > 1)
524 free(mb_properties);
525#endif /* MBS_SUPPORT */
526 return offset;
527 }
528#ifdef MBS_SUPPORT
529 if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
530 continue; /* It is a part of multibyte character. */
531#endif /* MBS_SUPPORT */
532 beg += offset;
533 len = kwsmatch.size[0];
534 if (exact)
535 {
536 *match_size = len;
537#ifdef MBS_SUPPORT
538 if (MB_CUR_MAX > 1)
539 free (mb_properties);
540#endif /* MBS_SUPPORT */
541 return beg - buf;
542 }
543 if (match_lines)
544 {
545 if (beg > buf && beg[-1] != eol)
546 continue;
547 if (beg + len < buf + size && beg[len] != eol)
548 continue;
549 goto success;
550 }
551 else if (match_words)
552 for (try = beg; len; )
553 {
554 if (try > buf && WCHAR((unsigned char) try[-1]))
555 break;
556 if (try + len < buf + size && WCHAR((unsigned char) try[len]))
557 {
558 offset = kwsexec (kwset, beg, --len, &kwsmatch);
559 if (offset == (size_t) -1)
560 {
561#ifdef MBS_SUPPORT
562 if (MB_CUR_MAX > 1)
563 free (mb_properties);
564#endif /* MBS_SUPPORT */
565 return offset;
566 }
567 try = beg + offset;
568 len = kwsmatch.size[0];
569 }
570 else
571 goto success;
572 }
573 else
574 goto success;
575 }
576
577#ifdef MBS_SUPPORT
578 if (MB_CUR_MAX > 1)
579 free (mb_properties);
580#endif /* MBS_SUPPORT */
581 return -1;
582
583 success:
584 end = memchr (beg + len, eol, (buf + size) - (beg + len));
585 end++;
586 while (buf < beg && beg[-1] != eol)
587 --beg;
588 *match_size = end - beg;
589#ifdef MBS_SUPPORT
590 if (MB_CUR_MAX > 1)
591 free (mb_properties);
592#endif /* MBS_SUPPORT */
593 return beg - buf;
594}
595
596#if HAVE_LIBPCRE
597/* Compiled internal form of a Perl regular expression. */
598static pcre *cre;
599
600/* Additional information about the pattern. */
601static pcre_extra *extra;
602#endif
603
604static void
605Pcompile (char const *pattern, size_t size)
606{
607#if !HAVE_LIBPCRE
608 error (2, 0, _("The -P option is not supported"));
609#else
610 int e;
611 char const *ep;
612 char *re = xmalloc (4 * size + 7);
613 int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
614 char const *patlim = pattern + size;
615 char *n = re;
616 char const *p;
617 char const *pnul;
618
619 /* FIXME: Remove this restriction. */
620 if (eolbyte != '\n')
621 error (2, 0, _("The -P and -z options cannot be combined"));
622
623 *n = '\0';
624 if (match_lines)
625 strcpy (n, "^(");
626 if (match_words)
627 strcpy (n, "\\b(");
628 n += strlen (n);
629
630 /* The PCRE interface doesn't allow NUL bytes in the pattern, so
631 replace each NUL byte in the pattern with the four characters
632 "\000", removing a preceding backslash if there are an odd
633 number of backslashes before the NUL.
634
635 FIXME: This method does not work with some multibyte character
636 encodings, notably Shift-JIS, where a multibyte character can end
637 in a backslash byte. */
638 for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
639 {
640 memcpy (n, p, pnul - p);
641 n += pnul - p;
642 for (p = pnul; pattern < p && p[-1] == '\\'; p--)
643 continue;
644 n -= (pnul - p) & 1;
645 strcpy (n, "\\000");
646 n += 4;
647 }
648
649 memcpy (n, p, patlim - p);
650 n += patlim - p;
651 *n = '\0';
652 if (match_words)
653 strcpy (n, ")\\b");
654 if (match_lines)
655 strcpy (n, ")$");
656
657 cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
658 if (!cre)
659 error (2, 0, ep);
660
661 extra = pcre_study (cre, 0, &ep);
662 if (ep)
663 error (2, 0, ep);
664
665 free (re);
666#endif
667}
668
669static size_t
670Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
671{
672#if !HAVE_LIBPCRE
673 abort ();
674 return -1;
675#else
676 /* This array must have at least two elements; everything after that
677 is just for performance improvement in pcre_exec. */
678 int sub[300];
679
680 int e = pcre_exec (cre, extra, buf, size, 0, 0,
681 sub, sizeof sub / sizeof *sub);
682
683 if (e <= 0)
684 {
685 switch (e)
686 {
687 case PCRE_ERROR_NOMATCH:
688 return -1;
689
690 case PCRE_ERROR_NOMEMORY:
691 error (2, 0, _("Memory exhausted"));
692
693 default:
694 abort ();
695 }
696 }
697 else
698 {
699 /* Narrow down to the line we've found. */
700 char const *beg = buf + sub[0];
701 char const *end = buf + sub[1];
702 char const *buflim = buf + size;
703 char eol = eolbyte;
704 if (!exact)
705 {
706 end = memchr (end, eol, buflim - end);
707 end++;
708 while (buf < beg && beg[-1] != eol)
709 --beg;
710 }
711
712 *match_size = end - beg;
713 return beg - buf;
714 }
715#endif
716}
717
718struct matcher const matchers[] = {
719 { "default", Gcompile, EGexecute },
720 { "grep", Gcompile, EGexecute },
721 { "egrep", Ecompile, EGexecute },
722 { "awk", Ecompile, EGexecute },
723 { "fgrep", Fcompile, Fexecute },
724 { "perl", Pcompile, Pexecute },
725 { "", 0, 0 },
726};