1/*	$NetBSD$	*/
2
3/* search.c - searching subroutines using dfa, kwset and regex for grep.
4   Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software
18   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19   02111-1307, USA.  */
20
21/* Written August 1992 by Mike Haertel. */
22
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26#include <sys/types.h>
27#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC && defined HAVE_WCTYPE
28/* We can handle multibyte string.  */
29# define MBS_SUPPORT
30# include <wchar.h>
31# include <wctype.h>
32#endif
33
34#include "system.h"
35#include "grep.h"
36#include "regex.h"
37#include "dfa.h"
38#include "kwset.h"
39#include "error.h"
40#include "xalloc.h"
41#ifdef HAVE_LIBPCRE
42# include <pcre.h>
43#endif
44
45#define NCHAR (UCHAR_MAX + 1)
46
47/* For -w, we also consider _ to be word constituent.  */
48#define WCHAR(C) (ISALNUM(C) || (C) == '_')
49
50/* DFA compiled regexp. */
51static struct dfa dfa;
52
53/* The Regex compiled patterns.  */
54static struct patterns
55{
56  /* Regex compiled regexp. */
57  struct re_pattern_buffer regexbuf;
58  struct re_registers regs; /* This is here on account of a BRAIN-DEAD
59			       Q@#%!# library interface in regex.c.  */
60} patterns0;
61
62struct patterns *patterns;
63size_t pcount;
64
65/* KWset compiled pattern.  For Ecompile and Gcompile, we compile
66   a list of strings, at least one of which is known to occur in
67   any string matching the regexp. */
68static kwset_t kwset;
69
70/* Number of compiled fixed strings known to exactly match the regexp.
71   If kwsexec returns < kwset_exact_matches, then we don't need to
72   call the regexp matcher at all. */
73static int kwset_exact_matches;
74
75#if defined(MBS_SUPPORT)
76static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
77#endif
78static void kwsinit PARAMS ((void));
79static void kwsmusts PARAMS ((void));
80static void Gcompile PARAMS ((char const *, size_t));
81static void Ecompile PARAMS ((char const *, size_t));
82static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
83static void Fcompile PARAMS ((char const *, size_t));
84static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
85static void Pcompile PARAMS ((char const *, size_t ));
86static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
87
88void
89dfaerror (char const *mesg)
90{
91  error (2, 0, mesg);
92}
93
94static void
95kwsinit (void)
96{
97  static char trans[NCHAR];
98  int i;
99
100  if (match_icase)
101    for (i = 0; i < NCHAR; ++i)
102      trans[i] = TOLOWER (i);
103
104  if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
105    error (2, 0, _("memory exhausted"));
106}
107
108/* If the DFA turns out to have some set of fixed strings one of
109   which must occur in the match, then we build a kwset matcher
110   to find those strings, and thus quickly filter out impossible
111   matches. */
112static void
113kwsmusts (void)
114{
115  struct dfamust const *dm;
116  char const *err;
117
118  if (dfa.musts)
119    {
120      kwsinit ();
121      /* First, we compile in the substrings known to be exact
122	 matches.  The kwset matcher will return the index
123	 of the matching string that it chooses. */
124      for (dm = dfa.musts; dm; dm = dm->next)
125	{
126	  if (!dm->exact)
127	    continue;
128	  ++kwset_exact_matches;
129	  if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
130	    error (2, 0, err);
131	}
132      /* Now, we compile the substrings that will require
133	 the use of the regexp matcher.  */
134      for (dm = dfa.musts; dm; dm = dm->next)
135	{
136	  if (dm->exact)
137	    continue;
138	  if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
139	    error (2, 0, err);
140	}
141      if ((err = kwsprep (kwset)) != 0)
142	error (2, 0, err);
143    }
144}
145
146#ifdef MBS_SUPPORT
147/* This function allocate the array which correspond to "buf".
148   Then this check multibyte string and mark on the positions which
149   are not singlebyte character nor the first byte of a multibyte
150   character.  Caller must free the array.  */
151static char*
152check_multibyte_string(char const *buf, size_t size)
153{
154  char *mb_properties = malloc(size);
155  mbstate_t cur_state;
156  size_t i;
157  memset(&cur_state, 0, sizeof(mbstate_t));
158  memset(mb_properties, 0, sizeof(char)*size);
159  for (i = 0; i < size ;)
160    {
161      size_t mbclen;
162      mbclen = mbrlen(buf + i, size - i, &cur_state);
163
164      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
165	{
166	  /* An invalid sequence, or a truncated multibyte character.
167	     We treat it as a singlebyte character.  */
168	  mbclen = 1;
169	}
170      mb_properties[i] = mbclen;
171      i += mbclen;
172    }
173
174  return mb_properties;
175}
176#endif
177
178static void
179Gcompile (char const *pattern, size_t size)
180{
181  const char *err;
182  char const *sep;
183  size_t total = size;
184  char const *motif = pattern;
185
186  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
187  dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
188
189  /* For GNU regex compiler we have to pass the patterns separately to detect
190     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
191     GNU regex should have raise a syntax error.  The same for backref, where
192     the backref should have been local to each pattern.  */
193  do
194    {
195      size_t len;
196      sep = memchr (motif, '\n', total);
197      if (sep)
198	{
199	  len = sep - motif;
200	  sep++;
201	  total -= (len + 1);
202	}
203      else
204	{
205	  len = total;
206	  total = 0;
207	}
208
209      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
210      if (patterns == NULL)
211	error (2, errno, _("memory exhausted"));
212
213      patterns[pcount] = patterns0;
214
215      if ((err = re_compile_pattern (motif, len,
216				    &(patterns[pcount].regexbuf))) != 0)
217	error (2, 0, err);
218      pcount++;
219
220      motif = sep;
221    } while (sep && total != 0);
222
223  /* In the match_words and match_lines cases, we use a different pattern
224     for the DFA matcher that will quickly throw out cases that won't work.
225     Then if DFA succeeds we do some hairy stuff using the regex matcher
226     to decide whether the match should really count. */
227  if (match_words || match_lines)
228    {
229      /* In the whole-word case, we use the pattern:
230	 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
231	 In the whole-line case, we use the pattern:
232	 ^\(userpattern\)$.  */
233
234      static char const line_beg[] = "^\\(";
235      static char const line_end[] = "\\)$";
236      static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
237      static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
238      char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
239      size_t i;
240      strcpy (n, match_lines ? line_beg : word_beg);
241      i = strlen (n);
242      memcpy (n + i, pattern, size);
243      i += size;
244      strcpy (n + i, match_lines ? line_end : word_end);
245      i += strlen (n + i);
246      pattern = n;
247      size = i;
248    }
249
250  dfacomp (pattern, size, &dfa, 1);
251  kwsmusts ();
252}
253
254static void
255Ecompile (char const *pattern, size_t size)
256{
257  const char *err;
258  const char *sep;
259  size_t total = size;
260  char const *motif = pattern;
261
262  if (strcmp (matcher, "awk") == 0)
263    {
264      re_set_syntax (RE_SYNTAX_AWK);
265      dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
266    }
267  else
268    {
269      re_set_syntax (RE_SYNTAX_POSIX_EGREP);
270      dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
271    }
272
273  /* For GNU regex compiler we have to pass the patterns separately to detect
274     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
275     GNU regex should have raise a syntax error.  The same for backref, where
276     the backref should have been local to each pattern.  */
277  do
278    {
279      size_t len;
280      sep = memchr (motif, '\n', total);
281      if (sep)
282	{
283	  len = sep - motif;
284	  sep++;
285	  total -= (len + 1);
286	}
287      else
288	{
289	  len = total;
290	  total = 0;
291	}
292
293      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
294      if (patterns == NULL)
295	error (2, errno, _("memory exhausted"));
296      patterns[pcount] = patterns0;
297
298      if ((err = re_compile_pattern (motif, len,
299				    &(patterns[pcount].regexbuf))) != 0)
300	error (2, 0, err);
301      pcount++;
302
303      motif = sep;
304    } while (sep && total != 0);
305
306  /* In the match_words and match_lines cases, we use a different pattern
307     for the DFA matcher that will quickly throw out cases that won't work.
308     Then if DFA succeeds we do some hairy stuff using the regex matcher
309     to decide whether the match should really count. */
310  if (match_words || match_lines)
311    {
312      /* In the whole-word case, we use the pattern:
313	 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
314	 In the whole-line case, we use the pattern:
315	 ^(userpattern)$.  */
316
317      static char const line_beg[] = "^(";
318      static char const line_end[] = ")$";
319      static char const word_beg[] = "(^|[^[:alnum:]_])(";
320      static char const word_end[] = ")([^[:alnum:]_]|$)";
321      char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
322      size_t i;
323      strcpy (n, match_lines ? line_beg : word_beg);
324      i = strlen(n);
325      memcpy (n + i, pattern, size);
326      i += size;
327      strcpy (n + i, match_lines ? line_end : word_end);
328      i += strlen (n + i);
329      pattern = n;
330      size = i;
331    }
332
333  dfacomp (pattern, size, &dfa, 1);
334  kwsmusts ();
335}
336
337static size_t
338EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
339{
340  register char const *buflim, *beg, *end;
341  char eol = eolbyte;
342  int backref;
343  ptrdiff_t start, len;
344  struct kwsmatch kwsm;
345  size_t i;
346#ifdef MBS_SUPPORT
347  char *mb_properties = NULL;
348#endif /* MBS_SUPPORT */
349
350#ifdef MBS_SUPPORT
351  if (MB_CUR_MAX > 1 && kwset)
352    mb_properties = check_multibyte_string(buf, size);
353#endif /* MBS_SUPPORT */
354
355  buflim = buf + size;
356
357  for (beg = end = buf; end < buflim; beg = end)
358    {
359      if (!exact)
360	{
361	  if (kwset)
362	    {
363	      /* Find a possible match using the KWset matcher. */
364	      size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
365	      if (offset == (size_t) -1)
366	        goto failure;
367	      beg += offset;
368	      /* Narrow down to the line containing the candidate, and
369		 run it through DFA. */
370	      end = memchr(beg, eol, buflim - beg);
371	      end++;
372#ifdef MBS_SUPPORT
373	      if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
374		continue;
375#endif
376	      while (beg > buf && beg[-1] != eol)
377		--beg;
378	      if (kwsm.index < kwset_exact_matches)
379		goto success_in_beg_and_end;
380	      if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
381		continue;
382	    }
383	  else
384	    {
385	      /* No good fixed strings; start with DFA. */
386	      size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
387	      if (offset == (size_t) -1)
388		break;
389	      /* Narrow down to the line we've found. */
390	      beg += offset;
391	      end = memchr (beg, eol, buflim - beg);
392	      end++;
393	      while (beg > buf && beg[-1] != eol)
394		--beg;
395	    }
396	  /* Successful, no backreferences encountered! */
397	  if (!backref)
398	    goto success_in_beg_and_end;
399	}
400      else
401	end = beg + size;
402
403      /* If we've made it to this point, this means DFA has seen
404	 a probable match, and we need to run it through Regex. */
405      for (i = 0; i < pcount; i++)
406	{
407	  patterns[i].regexbuf.not_eol = 0;
408	  if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
409				       end - beg - 1, 0,
410				       end - beg - 1, &(patterns[i].regs))))
411	    {
412	      len = patterns[i].regs.end[0] - start;
413	      if (exact && !match_words)
414	        goto success_in_start_and_len;
415	      if ((!match_lines && !match_words)
416		  || (match_lines && len == end - beg - 1))
417		goto success_in_beg_and_end;
418	      /* If -w, check if the match aligns with word boundaries.
419		 We do this iteratively because:
420		 (a) the line may contain more than one occurence of the
421		 pattern, and
422		 (b) Several alternatives in the pattern might be valid at a
423		 given point, and we may need to consider a shorter one to
424		 find a word boundary.  */
425	      if (match_words)
426		while (start >= 0)
427		  {
428		    if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
429			&& (len == end - beg - 1
430			    || !WCHAR ((unsigned char) beg[start + len])))
431		      goto success_in_start_and_len;
432		    if (len > 0)
433		      {
434			/* Try a shorter length anchored at the same place. */
435			--len;
436			patterns[i].regexbuf.not_eol = 1;
437			len = re_match (&(patterns[i].regexbuf), beg,
438					start + len, start,
439					&(patterns[i].regs));
440		      }
441		    if (len <= 0)
442		      {
443			/* Try looking further on. */
444			if (start == end - beg - 1)
445			  break;
446			++start;
447			patterns[i].regexbuf.not_eol = 0;
448			start = re_search (&(patterns[i].regexbuf), beg,
449					   end - beg - 1,
450					   start, end - beg - 1 - start,
451					   &(patterns[i].regs));
452			len = patterns[i].regs.end[0] - start;
453		      }
454		  }
455	    }
456	} /* for Regex patterns.  */
457    } /* for (beg = end ..) */
458
459 failure:
460#ifdef MBS_SUPPORT
461  if (MB_CUR_MAX > 1 && mb_properties)
462    free (mb_properties);
463#endif /* MBS_SUPPORT */
464  return (size_t) -1;
465
466 success_in_beg_and_end:
467  len = end - beg;
468  start = beg - buf;
469  /* FALLTHROUGH */
470
471 success_in_start_and_len:
472#ifdef MBS_SUPPORT
473  if (MB_CUR_MAX > 1 && mb_properties)
474    free (mb_properties);
475#endif /* MBS_SUPPORT */
476  *match_size = len;
477  return start;
478}
479
480static void
481Fcompile (char const *pattern, size_t size)
482{
483  char const *beg, *lim, *err;
484
485  kwsinit ();
486  beg = pattern;
487  do
488    {
489      for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
490	;
491      if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
492	error (2, 0, err);
493      if (lim < pattern + size)
494	++lim;
495      beg = lim;
496    }
497  while (beg < pattern + size);
498
499  if ((err = kwsprep (kwset)) != 0)
500    error (2, 0, err);
501}
502
503static size_t
504Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
505{
506  register char const *beg, *try, *end;
507  register size_t len;
508  char eol = eolbyte;
509  struct kwsmatch kwsmatch;
510#ifdef MBS_SUPPORT
511  char *mb_properties;
512  if (MB_CUR_MAX > 1)
513    mb_properties = check_multibyte_string (buf, size);
514#endif /* MBS_SUPPORT */
515
516  for (beg = buf; beg <= buf + size; ++beg)
517    {
518      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
519      if (offset == (size_t) -1)
520	goto failure;
521#ifdef MBS_SUPPORT
522      if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
523	continue; /* It is a part of multibyte character.  */
524#endif /* MBS_SUPPORT */
525      beg += offset;
526      len = kwsmatch.size[0];
527      if (exact && !match_words)
528	goto success_in_beg_and_len;
529      if (match_lines)
530	{
531	  if (beg > buf && beg[-1] != eol)
532	    continue;
533	  if (beg + len < buf + size && beg[len] != eol)
534	    continue;
535	  goto success;
536	}
537      else if (match_words)
538	{
539	  while (offset >= 0)
540	    {
541	      if ((offset == 0 || !WCHAR ((unsigned char) beg[-1]))
542	          && (len == end - beg - 1 || !WCHAR ((unsigned char) beg[len])))
543	        {
544	          if (!exact)
545	            /* Returns the whole line now we know there's a word match. */
546	            goto success;
547	          else
548	            /* Returns just this word match. */
549	            goto success_in_beg_and_len;
550	        }
551	      if (len > 0)
552	        {
553	          /* Try a shorter length anchored at the same place. */
554	          --len;
555	          offset = kwsexec (kwset, beg, len, &kwsmatch);
556	          if (offset == -1) {
557	            break; /* Try a different anchor. */
558	          }
559	          beg += offset;
560	          len = kwsmatch.size[0];
561	        }
562	    }
563	}
564      else
565	goto success;
566    }
567
568 failure:
569#ifdef MBS_SUPPORT
570  if (MB_CUR_MAX > 1)
571    free (mb_properties);
572#endif /* MBS_SUPPORT */
573  return -1;
574
575 success:
576  end = memchr (beg + len, eol, (buf + size) - (beg + len));
577  end++;
578  while (buf < beg && beg[-1] != eol)
579    --beg;
580  len = end - beg;
581  /* FALLTHROUGH */
582
583 success_in_beg_and_len:
584  *match_size = len;
585#ifdef MBS_SUPPORT
586  if (MB_CUR_MAX > 1)
587    free (mb_properties);
588#endif /* MBS_SUPPORT */
589  return beg - buf;
590}
591
592#if HAVE_LIBPCRE
593/* Compiled internal form of a Perl regular expression.  */
594static pcre *cre;
595
596/* Additional information about the pattern.  */
597static pcre_extra *extra;
598#endif
599
600static void
601Pcompile (char const *pattern, size_t size)
602{
603#if !HAVE_LIBPCRE
604  error (2, 0, _("The -P option is not supported"));
605#else
606  int e;
607  char const *ep;
608  char *re = xmalloc (4 * size + 7);
609  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
610  char const *patlim = pattern + size;
611  char *n = re;
612  char const *p;
613  char const *pnul;
614
615  /* FIXME: Remove this restriction.  */
616  if (eolbyte != '\n')
617    error (2, 0, _("The -P and -z options cannot be combined"));
618
619  *n = '\0';
620  if (match_lines)
621    strcpy (n, "^(");
622  if (match_words)
623    strcpy (n, "\\b(");
624  n += strlen (n);
625
626  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
627     replace each NUL byte in the pattern with the four characters
628     "\000", removing a preceding backslash if there are an odd
629     number of backslashes before the NUL.
630
631     FIXME: This method does not work with some multibyte character
632     encodings, notably Shift-JIS, where a multibyte character can end
633     in a backslash byte.  */
634  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
635    {
636      memcpy (n, p, pnul - p);
637      n += pnul - p;
638      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
639	continue;
640      n -= (pnul - p) & 1;
641      strcpy (n, "\\000");
642      n += 4;
643    }
644
645  memcpy (n, p, patlim - p);
646  n += patlim - p;
647  *n = '\0';
648  if (match_words)
649    strcpy (n, ")\\b");
650  if (match_lines)
651    strcpy (n, ")$");
652
653  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
654  if (!cre)
655    error (2, 0, ep);
656
657  extra = pcre_study (cre, 0, &ep);
658  if (ep)
659    error (2, 0, ep);
660
661  free (re);
662#endif
663}
664
665static size_t
666Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
667{
668#if !HAVE_LIBPCRE
669  abort ();
670  return -1;
671#else
672  /* This array must have at least two elements; everything after that
673     is just for performance improvement in pcre_exec.  */
674  int sub[300];
675
676  int e = pcre_exec (cre, extra, buf, size, 0, 0,
677		     sub, sizeof sub / sizeof *sub);
678
679  if (e <= 0)
680    {
681      switch (e)
682	{
683	case PCRE_ERROR_NOMATCH:
684	  return -1;
685
686	case PCRE_ERROR_NOMEMORY:
687	  error (2, 0, _("Memory exhausted"));
688
689	default:
690	  abort ();
691	}
692    }
693  else
694    {
695      /* Narrow down to the line we've found.  */
696      char const *beg = buf + sub[0];
697      char const *end = buf + sub[1];
698      char const *buflim = buf + size;
699      char eol = eolbyte;
700      if (!exact)
701	{
702	  end = memchr (end, eol, buflim - end);
703	  end++;
704	  while (buf < beg && beg[-1] != eol)
705	    --beg;
706	}
707
708      *match_size = end - beg;
709      return beg - buf;
710    }
711#endif
712}
713
714struct matcher const matchers[] = {
715  { "default", Gcompile, EGexecute },
716  { "grep", Gcompile, EGexecute },
717  { "egrep", Ecompile, EGexecute },
718  { "awk", Ecompile, EGexecute },
719  { "fgrep", Fcompile, Fexecute },
720  { "perl", Pcompile, Pexecute },
721  { "", 0, 0 },
722};
723