1/* Pattern Matchers for Regular Expressions.
2   Copyright (C) 1992, 1998, 2000, 2005 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software Foundation,
16   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17
18#ifdef HAVE_CONFIG_H
19# include <config.h>
20#endif
21
22/* Specification.  */
23#include "libgrep.h"
24
25#include <ctype.h>
26#include <stdlib.h>
27#include <string.h>
28
29#include "error.h"
30#include "exitfail.h"
31#include "xalloc.h"
32#include "m-common.h"
33
34/* This must be included _after_ m-common.h: It depends on MBS_SUPPORT.  */
35#include "dfa.h"
36
37#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
38# define IN_CTYPE_DOMAIN(c) 1
39#else
40# define IN_CTYPE_DOMAIN(c) isascii(c)
41#endif
42#define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C))
43
44struct compiled_regex {
45  bool match_words;
46  bool match_lines;
47  char eolbyte;
48
49  /* DFA compiled regexp. */
50  struct dfa dfa;
51
52  /* The Regex compiled patterns.  */
53  struct patterns
54  {
55    /* Regex compiled regexp. */
56    struct re_pattern_buffer regexbuf;
57    struct re_registers regs; /* This is here on account of a BRAIN-DEAD
58				 Q@#%!# library interface in regex.c.  */
59  } *patterns;
60  size_t pcount;
61
62  /* KWset compiled pattern.  We compile a list of strings, at least one of
63     which is known to occur in any string matching the regexp. */
64  struct compiled_kwset ckwset;
65
66  /* Number of compiled fixed strings known to exactly match the regexp.
67     If kwsexec returns < kwset_exact_matches, then we don't need to
68     call the regexp matcher at all. */
69  int kwset_exact_matches;
70};
71
72/* Callback from dfa.c.  */
73void
74dfaerror (const char *mesg)
75{
76  error (exit_failure, 0, mesg);
77}
78
79/* If the DFA turns out to have some set of fixed strings one of
80   which must occur in the match, then we build a kwset matcher
81   to find those strings, and thus quickly filter out impossible
82   matches. */
83static void
84kwsmusts (struct compiled_regex *cregex,
85	  bool match_icase, bool match_words, bool match_lines, char eolbyte)
86{
87  struct dfamust const *dm;
88  const char *err;
89
90  if (cregex->dfa.musts)
91    {
92      kwsinit (&cregex->ckwset, match_icase, match_words, match_lines, eolbyte);
93      /* First, we compile in the substrings known to be exact
94	 matches.  The kwset matcher will return the index
95	 of the matching string that it chooses. */
96      for (dm = cregex->dfa.musts; dm; dm = dm->next)
97	{
98	  if (!dm->exact)
99	    continue;
100	  cregex->kwset_exact_matches++;
101	  if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL)
102	    error (exit_failure, 0, err);
103	}
104      /* Now, we compile the substrings that will require
105	 the use of the regexp matcher.  */
106      for (dm = cregex->dfa.musts; dm; dm = dm->next)
107	{
108	  if (dm->exact)
109	    continue;
110	  if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL)
111	    error (exit_failure, 0, err);
112	}
113      if ((err = kwsprep (cregex->ckwset.kwset)) != NULL)
114	error (exit_failure, 0, err);
115    }
116}
117
118static void *
119Gcompile (const char *pattern, size_t pattern_size,
120	  bool match_icase, bool match_words, bool match_lines, char eolbyte)
121{
122  struct compiled_regex *cregex;
123  const char *err;
124  const char *sep;
125  size_t total = pattern_size;
126  const char *motif = pattern;
127
128  cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex));
129  memset (cregex, '\0', sizeof (struct compiled_regex));
130  cregex->match_words = match_words;
131  cregex->match_lines = match_lines;
132  cregex->eolbyte = eolbyte;
133  cregex->patterns = NULL;
134  cregex->pcount = 0;
135
136  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
137  dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
138
139  /* For GNU regex compiler we have to pass the patterns separately to detect
140     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
141     GNU regex should have raise a syntax error.  The same for backref, where
142     the backref should have been local to each pattern.  */
143  do
144    {
145      size_t len;
146      sep = memchr (motif, '\n', total);
147      if (sep)
148	{
149	  len = sep - motif;
150	  sep++;
151	  total -= (len + 1);
152	}
153      else
154	{
155	  len = total;
156	  total = 0;
157	}
158
159      cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
160      memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
161
162      if ((err = re_compile_pattern (motif, len,
163				     &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
164	error (exit_failure, 0, err);
165      cregex->pcount++;
166
167      motif = sep;
168    } while (sep && total != 0);
169
170  /* In the match_words and match_lines cases, we use a different pattern
171     for the DFA matcher that will quickly throw out cases that won't work.
172     Then if DFA succeeds we do some hairy stuff using the regex matcher
173     to decide whether the match should really count. */
174  if (match_words || match_lines)
175    {
176      /* In the whole-word case, we use the pattern:
177	 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
178	 In the whole-line case, we use the pattern:
179	 ^\(userpattern\)$.  */
180
181      static const char line_beg[] = "^\\(";
182      static const char line_end[] = "\\)$";
183      static const char word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
184      static const char word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
185      char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end);
186      size_t i;
187      strcpy (n, match_lines ? line_beg : word_beg);
188      i = strlen (n);
189      memcpy (n + i, pattern, pattern_size);
190      i += pattern_size;
191      strcpy (n + i, match_lines ? line_end : word_end);
192      i += strlen (n + i);
193      pattern = n;
194      pattern_size = i;
195    }
196
197  dfacomp (pattern, pattern_size, &cregex->dfa, 1);
198  kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);
199
200  return cregex;
201}
202
203static void *
204compile (const char *pattern, size_t pattern_size,
205	 bool match_icase, bool match_words, bool match_lines, char eolbyte,
206	 reg_syntax_t syntax)
207{
208  struct compiled_regex *cregex;
209  const char *err;
210  const char *sep;
211  size_t total = pattern_size;
212  const char *motif = pattern;
213
214  cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex));
215  memset (cregex, '\0', sizeof (struct compiled_regex));
216  cregex->match_words = match_words;
217  cregex->match_lines = match_lines;
218  cregex->eolbyte = eolbyte;
219  cregex->patterns = NULL;
220  cregex->pcount = 0;
221
222  re_set_syntax (syntax);
223  dfasyntax (syntax, match_icase, eolbyte);
224
225  /* For GNU regex compiler we have to pass the patterns separately to detect
226     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
227     GNU regex should have raise a syntax error.  The same for backref, where
228     the backref should have been local to each pattern.  */
229  do
230    {
231      size_t len;
232      sep = memchr (motif, '\n', total);
233      if (sep)
234	{
235	  len = sep - motif;
236	  sep++;
237	  total -= (len + 1);
238	}
239      else
240	{
241	  len = total;
242	  total = 0;
243	}
244
245      cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
246      memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
247
248      if ((err = re_compile_pattern (motif, len,
249				     &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
250	error (exit_failure, 0, err);
251      cregex->pcount++;
252
253      motif = sep;
254    } while (sep && total != 0);
255
256  /* In the match_words and match_lines cases, we use a different pattern
257     for the DFA matcher that will quickly throw out cases that won't work.
258     Then if DFA succeeds we do some hairy stuff using the regex matcher
259     to decide whether the match should really count. */
260  if (match_words || match_lines)
261    {
262      /* In the whole-word case, we use the pattern:
263	 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
264	 In the whole-line case, we use the pattern:
265	 ^(userpattern)$.  */
266
267      static const char line_beg[] = "^(";
268      static const char line_end[] = ")$";
269      static const char word_beg[] = "(^|[^[:alnum:]_])(";
270      static const char word_end[] = ")([^[:alnum:]_]|$)";
271      char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end);
272      size_t i;
273      strcpy (n, match_lines ? line_beg : word_beg);
274      i = strlen(n);
275      memcpy (n + i, pattern, pattern_size);
276      i += pattern_size;
277      strcpy (n + i, match_lines ? line_end : word_end);
278      i += strlen (n + i);
279      pattern = n;
280      pattern_size = i;
281    }
282
283  dfacomp (pattern, pattern_size, &cregex->dfa, 1);
284  kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);
285
286  return cregex;
287}
288
289static void *
290Ecompile (const char *pattern, size_t pattern_size,
291	  bool match_icase, bool match_words, bool match_lines, char eolbyte)
292{
293  return compile (pattern, pattern_size,
294		  match_icase, match_words, match_lines, eolbyte,
295		  RE_SYNTAX_POSIX_EGREP);
296}
297
298static void *
299AWKcompile (const char *pattern, size_t pattern_size,
300	    bool match_icase, bool match_words, bool match_lines, char eolbyte)
301{
302  return compile (pattern, pattern_size,
303		  match_icase, match_words, match_lines, eolbyte,
304		  RE_SYNTAX_AWK);
305}
306
307static size_t
308EGexecute (const void *compiled_pattern,
309	   const char *buf, size_t buf_size,
310	   size_t *match_size, bool exact)
311{
312  struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
313  register const char *buflim, *beg, *end;
314  char eol = cregex->eolbyte;
315  int backref, start, len;
316  struct kwsmatch kwsm;
317  size_t i;
318#ifdef MBS_SUPPORT
319  char *mb_properties = NULL;
320#endif /* MBS_SUPPORT */
321
322#ifdef MBS_SUPPORT
323  if (MB_CUR_MAX > 1 && cregex->ckwset.kwset)
324    mb_properties = check_multibyte_string (buf, buf_size);
325#endif /* MBS_SUPPORT */
326
327  buflim = buf + buf_size;
328
329  for (beg = end = buf; end < buflim; beg = end)
330    {
331      if (!exact)
332	{
333	  if (cregex->ckwset.kwset)
334	    {
335	      /* Find a possible match using the KWset matcher. */
336	      size_t offset = kwsexec (cregex->ckwset.kwset, beg, buflim - beg, &kwsm);
337	      if (offset == (size_t) -1)
338		{
339#ifdef MBS_SUPPORT
340		  if (MB_CUR_MAX > 1)
341		    free (mb_properties);
342#endif
343		  return (size_t)-1;
344		}
345	      beg += offset;
346	      /* Narrow down to the line containing the candidate, and
347		 run it through DFA. */
348	      end = memchr (beg, eol, buflim - beg);
349	      if (end != NULL)
350		end++;
351	      else
352		end = buflim;
353#ifdef MBS_SUPPORT
354	      if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
355		continue;
356#endif
357	      while (beg > buf && beg[-1] != eol)
358		--beg;
359	      if (kwsm.index < cregex->kwset_exact_matches)
360		goto success;
361	      if (dfaexec (&cregex->dfa, beg, end - beg, &backref) == (size_t) -1)
362		continue;
363	    }
364	  else
365	    {
366	      /* No good fixed strings; start with DFA. */
367	      size_t offset = dfaexec (&cregex->dfa, beg, buflim - beg, &backref);
368	      if (offset == (size_t) -1)
369		break;
370	      /* Narrow down to the line we've found. */
371	      beg += offset;
372	      end = memchr (beg, eol, buflim - beg);
373	      if (end != NULL)
374		end++;
375	      else
376		end = buflim;
377	      while (beg > buf && beg[-1] != eol)
378		--beg;
379	    }
380	  /* Successful, no backreferences encountered! */
381	  if (!backref)
382	    goto success;
383	}
384      else
385	end = beg + buf_size;
386
387      /* If we've made it to this point, this means DFA has seen
388	 a probable match, and we need to run it through Regex. */
389      for (i = 0; i < cregex->pcount; i++)
390	{
391	  cregex->patterns[i].regexbuf.not_eol = 0;
392	  if (0 <= (start = re_search (&(cregex->patterns[i].regexbuf), beg,
393				       end - beg - 1, 0,
394				       end - beg - 1, &(cregex->patterns[i].regs))))
395	    {
396	      len = cregex->patterns[i].regs.end[0] - start;
397	      if (exact)
398		{
399		  *match_size = len;
400		  return start;
401		}
402	      if ((!cregex->match_lines && !cregex->match_words)
403		  || (cregex->match_lines && len == end - beg - 1))
404		goto success;
405	      /* If -w, check if the match aligns with word boundaries.
406		 We do this iteratively because:
407		 (a) the line may contain more than one occurence of the
408		 pattern, and
409		 (b) Several alternatives in the pattern might be valid at a
410		 given point, and we may need to consider a shorter one to
411		 find a word boundary.  */
412	      if (cregex->match_words)
413		while (start >= 0)
414		  {
415		    if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1]))
416			&& (len == end - beg - 1
417			    || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len])))
418		      goto success;
419		    if (len > 0)
420		      {
421			/* Try a shorter length anchored at the same place. */
422			--len;
423			cregex->patterns[i].regexbuf.not_eol = 1;
424			len = re_match (&(cregex->patterns[i].regexbuf), beg,
425					start + len, start,
426					&(cregex->patterns[i].regs));
427		      }
428		    if (len <= 0)
429		      {
430			/* Try looking further on. */
431			if (start == end - beg - 1)
432			  break;
433			++start;
434			cregex->patterns[i].regexbuf.not_eol = 0;
435			start = re_search (&(cregex->patterns[i].regexbuf), beg,
436					   end - beg - 1,
437					   start, end - beg - 1 - start,
438					   &(cregex->patterns[i].regs));
439			len = cregex->patterns[i].regs.end[0] - start;
440		      }
441		  }
442	    }
443	} /* for Regex patterns.  */
444    } /* for (beg = end ..) */
445#ifdef MBS_SUPPORT
446  if (MB_CUR_MAX > 1 && mb_properties)
447    free (mb_properties);
448#endif /* MBS_SUPPORT */
449  return (size_t) -1;
450
451 success:
452#ifdef MBS_SUPPORT
453  if (MB_CUR_MAX > 1 && mb_properties)
454    free (mb_properties);
455#endif /* MBS_SUPPORT */
456  *match_size = end - beg;
457  return beg - buf;
458}
459
460static void
461EGfree (void *compiled_pattern)
462{
463  struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
464
465  dfafree (&cregex->dfa);
466  free (cregex->patterns);
467  free (cregex->ckwset.trans);
468  free (cregex);
469}
470
471/* POSIX Basic Regular Expressions */
472matcher_t matcher_grep =
473  {
474    Gcompile,
475    EGexecute,
476    EGfree
477  };
478
479/* POSIX Extended Regular Expressions */
480matcher_t matcher_egrep =
481  {
482    Ecompile,
483    EGexecute,
484    EGfree
485  };
486
487/* AWK Regular Expressions */
488matcher_t matcher_awk =
489  {
490    AWKcompile,
491    EGexecute,
492    EGfree
493  };
494