• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-R7000-V1.0.7.12_1.2.5/ap/gpl/timemachine/gettext-0.17/gettext-tools/libgrep/
1/* Pattern Matchers for Regular Expressions.
2   Copyright (C) 1992, 1998, 2000, 2005-2006 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
16
17#ifdef HAVE_CONFIG_H
18# include <config.h>
19#endif
20
21/* Specification.  */
22#include "libgrep.h"
23
24#include <ctype.h>
25#include <stdlib.h>
26#include <string.h>
27
28#include "error.h"
29#include "exitfail.h"
30#include "xalloc.h"
31#include "m-common.h"
32
33/* This must be included _after_ m-common.h: It depends on MBS_SUPPORT.  */
34#include "dfa.h"
35
36#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
37# define IN_CTYPE_DOMAIN(c) 1
38#else
39# define IN_CTYPE_DOMAIN(c) isascii(c)
40#endif
41#define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C))
42
43struct patterns
44{
45  /* Regex compiled regexp. */
46  struct re_pattern_buffer regexbuf;
47  struct re_registers regs; /* This is here on account of a BRAIN-DEAD
48			       Q@#%!# library interface in regex.c.  */
49};
50
51struct compiled_regex {
52  bool match_words;
53  bool match_lines;
54  char eolbyte;
55
56  /* DFA compiled regexp. */
57  struct dfa dfa;
58
59  /* The Regex compiled patterns.  */
60  struct patterns *patterns;
61  size_t pcount;
62
63  /* KWset compiled pattern.  We compile a list of strings, at least one of
64     which is known to occur in any string matching the regexp. */
65  struct compiled_kwset ckwset;
66
67  /* Number of compiled fixed strings known to exactly match the regexp.
68     If kwsexec returns < kwset_exact_matches, then we don't need to
69     call the regexp matcher at all. */
70  int kwset_exact_matches;
71};
72
73/* Callback from dfa.c.  */
74void
75dfaerror (const char *mesg)
76{
77  error (exit_failure, 0, mesg);
78}
79
80/* If the DFA turns out to have some set of fixed strings one of
81   which must occur in the match, then we build a kwset matcher
82   to find those strings, and thus quickly filter out impossible
83   matches. */
84static void
85kwsmusts (struct compiled_regex *cregex,
86	  bool match_icase, bool match_words, bool match_lines, char eolbyte)
87{
88  struct dfamust const *dm;
89  const char *err;
90
91  if (cregex->dfa.musts)
92    {
93      kwsinit (&cregex->ckwset, match_icase, match_words, match_lines, eolbyte);
94      /* First, we compile in the substrings known to be exact
95	 matches.  The kwset matcher will return the index
96	 of the matching string that it chooses. */
97      for (dm = cregex->dfa.musts; dm; dm = dm->next)
98	{
99	  if (!dm->exact)
100	    continue;
101	  cregex->kwset_exact_matches++;
102	  if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL)
103	    error (exit_failure, 0, err);
104	}
105      /* Now, we compile the substrings that will require
106	 the use of the regexp matcher.  */
107      for (dm = cregex->dfa.musts; dm; dm = dm->next)
108	{
109	  if (dm->exact)
110	    continue;
111	  if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL)
112	    error (exit_failure, 0, err);
113	}
114      if ((err = kwsprep (cregex->ckwset.kwset)) != NULL)
115	error (exit_failure, 0, err);
116    }
117}
118
119static void *
120Gcompile (const char *pattern, size_t pattern_size,
121	  bool match_icase, bool match_words, bool match_lines, char eolbyte)
122{
123  struct compiled_regex *cregex;
124  const char *err;
125  const char *sep;
126  size_t total = pattern_size;
127  const char *motif = pattern;
128
129  cregex = XMALLOC (struct compiled_regex);
130  memset (cregex, '\0', sizeof (struct compiled_regex));
131  cregex->match_words = match_words;
132  cregex->match_lines = match_lines;
133  cregex->eolbyte = eolbyte;
134  cregex->patterns = NULL;
135  cregex->pcount = 0;
136
137  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
138  dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
139
140  /* For GNU regex compiler we have to pass the patterns separately to detect
141     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
142     GNU regex should have raise a syntax error.  The same for backref, where
143     the backref should have been local to each pattern.  */
144  do
145    {
146      size_t len;
147      sep = (const char *) memchr (motif, '\n', total);
148      if (sep)
149	{
150	  len = sep - motif;
151	  sep++;
152	  total -= (len + 1);
153	}
154      else
155	{
156	  len = total;
157	  total = 0;
158	}
159
160      cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
161      memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
162
163      if ((err = re_compile_pattern (motif, len,
164				     &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
165	error (exit_failure, 0, err);
166      cregex->pcount++;
167
168      motif = sep;
169    } while (sep && total != 0);
170
171  /* In the match_words and match_lines cases, we use a different pattern
172     for the DFA matcher that will quickly throw out cases that won't work.
173     Then if DFA succeeds we do some hairy stuff using the regex matcher
174     to decide whether the match should really count. */
175  if (match_words || match_lines)
176    {
177      /* In the whole-word case, we use the pattern:
178	 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
179	 In the whole-line case, we use the pattern:
180	 ^\(userpattern\)$.  */
181
182      static const char line_beg[] = "^\\(";
183      static const char line_end[] = "\\)$";
184      static const char word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
185      static const char word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
186      char *n = XNMALLOC (sizeof word_beg - 1 + pattern_size + sizeof word_end, char);
187      size_t i;
188      strcpy (n, match_lines ? line_beg : word_beg);
189      i = strlen (n);
190      memcpy (n + i, pattern, pattern_size);
191      i += pattern_size;
192      strcpy (n + i, match_lines ? line_end : word_end);
193      i += strlen (n + i);
194      pattern = n;
195      pattern_size = i;
196    }
197
198  dfacomp (pattern, pattern_size, &cregex->dfa, 1);
199  kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);
200
201  return cregex;
202}
203
204static void *
205compile (const char *pattern, size_t pattern_size,
206	 bool match_icase, bool match_words, bool match_lines, char eolbyte,
207	 reg_syntax_t syntax)
208{
209  struct compiled_regex *cregex;
210  const char *err;
211  const char *sep;
212  size_t total = pattern_size;
213  const char *motif = pattern;
214
215  cregex = XMALLOC (struct compiled_regex);
216  memset (cregex, '\0', sizeof (struct compiled_regex));
217  cregex->match_words = match_words;
218  cregex->match_lines = match_lines;
219  cregex->eolbyte = eolbyte;
220  cregex->patterns = NULL;
221  cregex->pcount = 0;
222
223  re_set_syntax (syntax);
224  dfasyntax (syntax, match_icase, eolbyte);
225
226  /* For GNU regex compiler we have to pass the patterns separately to detect
227     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
228     GNU regex should have raise a syntax error.  The same for backref, where
229     the backref should have been local to each pattern.  */
230  do
231    {
232      size_t len;
233      sep = (const char *) memchr (motif, '\n', total);
234      if (sep)
235	{
236	  len = sep - motif;
237	  sep++;
238	  total -= (len + 1);
239	}
240      else
241	{
242	  len = total;
243	  total = 0;
244	}
245
246      cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
247      memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
248
249      if ((err = re_compile_pattern (motif, len,
250				     &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
251	error (exit_failure, 0, err);
252      cregex->pcount++;
253
254      motif = sep;
255    } while (sep && total != 0);
256
257  /* In the match_words and match_lines cases, we use a different pattern
258     for the DFA matcher that will quickly throw out cases that won't work.
259     Then if DFA succeeds we do some hairy stuff using the regex matcher
260     to decide whether the match should really count. */
261  if (match_words || match_lines)
262    {
263      /* In the whole-word case, we use the pattern:
264	 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
265	 In the whole-line case, we use the pattern:
266	 ^(userpattern)$.  */
267
268      static const char line_beg[] = "^(";
269      static const char line_end[] = ")$";
270      static const char word_beg[] = "(^|[^[:alnum:]_])(";
271      static const char word_end[] = ")([^[:alnum:]_]|$)";
272      char *n = XNMALLOC (sizeof word_beg - 1 + pattern_size + sizeof word_end, char);
273      size_t i;
274      strcpy (n, match_lines ? line_beg : word_beg);
275      i = strlen(n);
276      memcpy (n + i, pattern, pattern_size);
277      i += pattern_size;
278      strcpy (n + i, match_lines ? line_end : word_end);
279      i += strlen (n + i);
280      pattern = n;
281      pattern_size = i;
282    }
283
284  dfacomp (pattern, pattern_size, &cregex->dfa, 1);
285  kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);
286
287  return cregex;
288}
289
290static void *
291Ecompile (const char *pattern, size_t pattern_size,
292	  bool match_icase, bool match_words, bool match_lines, char eolbyte)
293{
294  return compile (pattern, pattern_size,
295		  match_icase, match_words, match_lines, eolbyte,
296		  RE_SYNTAX_POSIX_EGREP);
297}
298
299static void *
300AWKcompile (const char *pattern, size_t pattern_size,
301	    bool match_icase, bool match_words, bool match_lines, char eolbyte)
302{
303  return compile (pattern, pattern_size,
304		  match_icase, match_words, match_lines, eolbyte,
305		  RE_SYNTAX_AWK);
306}
307
308static size_t
309EGexecute (const void *compiled_pattern,
310	   const char *buf, size_t buf_size,
311	   size_t *match_size, bool exact)
312{
313  struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
314  register const char *buflim, *beg, *end;
315  char eol = cregex->eolbyte;
316  int backref, start, len;
317  struct kwsmatch kwsm;
318  size_t i;
319#ifdef MBS_SUPPORT
320  char *mb_properties = NULL;
321#endif /* MBS_SUPPORT */
322
323#ifdef MBS_SUPPORT
324  if (MB_CUR_MAX > 1 && cregex->ckwset.kwset)
325    mb_properties = check_multibyte_string (buf, buf_size);
326#endif /* MBS_SUPPORT */
327
328  buflim = buf + buf_size;
329
330  for (beg = end = buf; end < buflim; beg = end)
331    {
332      if (!exact)
333	{
334	  if (cregex->ckwset.kwset)
335	    {
336	      /* Find a possible match using the KWset matcher. */
337	      size_t offset = kwsexec (cregex->ckwset.kwset, beg, buflim - beg, &kwsm);
338	      if (offset == (size_t) -1)
339		{
340#ifdef MBS_SUPPORT
341		  if (MB_CUR_MAX > 1)
342		    free (mb_properties);
343#endif
344		  return (size_t)-1;
345		}
346	      beg += offset;
347	      /* Narrow down to the line containing the candidate, and
348		 run it through DFA. */
349	      end = (const char *) memchr (beg, eol, buflim - beg);
350	      if (end != NULL)
351		end++;
352	      else
353		end = buflim;
354#ifdef MBS_SUPPORT
355	      if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
356		continue;
357#endif
358	      while (beg > buf && beg[-1] != eol)
359		--beg;
360	      if (kwsm.index < cregex->kwset_exact_matches)
361		goto success;
362	      if (dfaexec (&cregex->dfa, beg, end - beg, &backref) == (size_t) -1)
363		continue;
364	    }
365	  else
366	    {
367	      /* No good fixed strings; start with DFA. */
368	      size_t offset = dfaexec (&cregex->dfa, beg, buflim - beg, &backref);
369	      if (offset == (size_t) -1)
370		break;
371	      /* Narrow down to the line we've found. */
372	      beg += offset;
373	      end = (const char *) memchr (beg, eol, buflim - beg);
374	      if (end != NULL)
375		end++;
376	      else
377		end = buflim;
378	      while (beg > buf && beg[-1] != eol)
379		--beg;
380	    }
381	  /* Successful, no backreferences encountered! */
382	  if (!backref)
383	    goto success;
384	}
385      else
386	end = beg + buf_size;
387
388      /* If we've made it to this point, this means DFA has seen
389	 a probable match, and we need to run it through Regex. */
390      for (i = 0; i < cregex->pcount; i++)
391	{
392	  cregex->patterns[i].regexbuf.not_eol = 0;
393	  if (0 <= (start = re_search (&(cregex->patterns[i].regexbuf), beg,
394				       end - beg - 1, 0,
395				       end - beg - 1, &(cregex->patterns[i].regs))))
396	    {
397	      len = cregex->patterns[i].regs.end[0] - start;
398	      if (exact)
399		{
400		  *match_size = len;
401		  return start;
402		}
403	      if ((!cregex->match_lines && !cregex->match_words)
404		  || (cregex->match_lines && len == end - beg - 1))
405		goto success;
406	      /* If -w, check if the match aligns with word boundaries.
407		 We do this iteratively because:
408		 (a) the line may contain more than one occurence of the
409		 pattern, and
410		 (b) Several alternatives in the pattern might be valid at a
411		 given point, and we may need to consider a shorter one to
412		 find a word boundary.  */
413	      if (cregex->match_words)
414		while (start >= 0)
415		  {
416		    if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1]))
417			&& (len == end - beg - 1
418			    || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len])))
419		      goto success;
420		    if (len > 0)
421		      {
422			/* Try a shorter length anchored at the same place. */
423			--len;
424			cregex->patterns[i].regexbuf.not_eol = 1;
425			len = re_match (&(cregex->patterns[i].regexbuf), beg,
426					start + len, start,
427					&(cregex->patterns[i].regs));
428		      }
429		    if (len <= 0)
430		      {
431			/* Try looking further on. */
432			if (start == end - beg - 1)
433			  break;
434			++start;
435			cregex->patterns[i].regexbuf.not_eol = 0;
436			start = re_search (&(cregex->patterns[i].regexbuf), beg,
437					   end - beg - 1,
438					   start, end - beg - 1 - start,
439					   &(cregex->patterns[i].regs));
440			len = cregex->patterns[i].regs.end[0] - start;
441		      }
442		  }
443	    }
444	} /* for Regex patterns.  */
445    } /* for (beg = end ..) */
446#ifdef MBS_SUPPORT
447  if (MB_CUR_MAX > 1 && mb_properties)
448    free (mb_properties);
449#endif /* MBS_SUPPORT */
450  return (size_t) -1;
451
452 success:
453#ifdef MBS_SUPPORT
454  if (MB_CUR_MAX > 1 && mb_properties)
455    free (mb_properties);
456#endif /* MBS_SUPPORT */
457  *match_size = end - beg;
458  return beg - buf;
459}
460
461static void
462EGfree (void *compiled_pattern)
463{
464  struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
465
466  dfafree (&cregex->dfa);
467  free (cregex->patterns);
468  free (cregex->ckwset.trans);
469  free (cregex);
470}
471
472/* POSIX Basic Regular Expressions */
473matcher_t matcher_grep =
474  {
475    Gcompile,
476    EGexecute,
477    EGfree
478  };
479
480/* POSIX Extended Regular Expressions */
481matcher_t matcher_egrep =
482  {
483    Ecompile,
484    EGexecute,
485    EGfree
486  };
487
488/* AWK Regular Expressions */
489matcher_t matcher_awk =
490  {
491    AWKcompile,
492    EGexecute,
493    EGfree
494  };
495