Deleted Added
full compact
search.c (126435) search.c (131557)
1/* search.c - searching subroutines using dfa, kwset and regex for grep.
2 Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8

--- 4 unchanged lines hidden (view full) ---

13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
17 02111-1307, USA. */
18
19/* Written August 1992 by Mike Haertel. */
20
1/* search.c - searching subroutines using dfa, kwset and regex for grep.
2 Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8

--- 4 unchanged lines hidden (view full) ---

13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
17 02111-1307, USA. */
18
19/* Written August 1992 by Mike Haertel. */
20
21/* $FreeBSD: head/gnu/usr.bin/grep/search.c 126435 2004-03-01 08:37:20Z ache $ */
21/* $FreeBSD: head/gnu/usr.bin/grep/search.c 131557 2004-07-04 10:02:03Z tjr $ */
22
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26#include <sys/types.h>
22
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26#include <sys/types.h>
27#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
28/* We can handle multibyte string. */
29# define MBS_SUPPORT
30# include <wchar.h>
31# include <wctype.h>
32#endif
33
27#include "system.h"
28#include "grep.h"
29#include "regex.h"
30#include "dfa.h"
31#include "kwset.h"
34#include "system.h"
35#include "grep.h"
36#include "regex.h"
37#include "dfa.h"
38#include "kwset.h"
39#include "error.h"
40#include "xalloc.h"
41#ifdef HAVE_LIBPCRE
42# include <pcre.h>
43#endif
32
33#define NCHAR (UCHAR_MAX + 1)
34
44
45#define NCHAR (UCHAR_MAX + 1)
46
35static void Gcompile PARAMS((char *, size_t));
36static void Ecompile PARAMS((char *, size_t));
37static char *EGexecute PARAMS((char *, size_t, char **));
38static void Fcompile PARAMS((char *, size_t));
39static char *Fexecute PARAMS((char *, size_t, char **));
40static void kwsinit PARAMS((void));
41
42/* Here is the matchers vector for the main program. */
43struct matcher matchers[] = {
44 { "default", Gcompile, EGexecute },
45 { "grep", Gcompile, EGexecute },
46 { "egrep", Ecompile, EGexecute },
47 { "awk", Ecompile, EGexecute },
48 { "fgrep", Fcompile, Fexecute },
49 { 0, 0, 0 },
50};
51
52/* For -w, we also consider _ to be word constituent. */
53#define WCHAR(C) (ISALNUM(C) || (C) == '_')
54
55/* DFA compiled regexp. */
56static struct dfa dfa;
57
47/* For -w, we also consider _ to be word constituent. */
48#define WCHAR(C) (ISALNUM(C) || (C) == '_')
49
50/* DFA compiled regexp. */
51static struct dfa dfa;
52
58/* Regex compiled regexp. */
59static struct re_pattern_buffer regexbuf;
53/* The Regex compiled patterns. */
54static struct patterns
55{
56 /* Regex compiled regexp. */
57 struct re_pattern_buffer regexbuf;
58 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
59 Q@#%!# library interface in regex.c. */
60} patterns0;
60
61
62struct patterns *patterns;
63size_t pcount;
64
61/* KWset compiled pattern. For Ecompile and Gcompile, we compile
62 a list of strings, at least one of which is known to occur in
63 any string matching the regexp. */
64static kwset_t kwset;
65
65/* KWset compiled pattern. For Ecompile and Gcompile, we compile
66 a list of strings, at least one of which is known to occur in
67 any string matching the regexp. */
68static kwset_t kwset;
69
66/* Last compiled fixed string known to exactly match the regexp.
67 If kwsexec() returns < lastexact, then we don't need to
70/* Number of compiled fixed strings known to exactly match the regexp.
71 If kwsexec returns < kwset_exact_matches, then we don't need to
68 call the regexp matcher at all. */
72 call the regexp matcher at all. */
69static int lastexact;
73static int kwset_exact_matches;
70
74
75#if defined(MBS_SUPPORT)
76static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
77#endif
78static void kwsinit PARAMS ((void));
79static void kwsmusts PARAMS ((void));
80static void Gcompile PARAMS ((char const *, size_t));
81static void Ecompile PARAMS ((char const *, size_t));
82static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
83static void Fcompile PARAMS ((char const *, size_t));
84static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
85static void Pcompile PARAMS ((char const *, size_t ));
86static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
87
71void
72dfaerror (char const *mesg)
73{
88void
89dfaerror (char const *mesg)
90{
74 fatal(mesg, 0);
91 error (2, 0, mesg);
75}
76
77static void
78kwsinit (void)
79{
80 static char trans[NCHAR];
81 int i;
82
83 if (match_icase)
84 for (i = 0; i < NCHAR; ++i)
92}
93
94static void
95kwsinit (void)
96{
97 static char trans[NCHAR];
98 int i;
99
100 if (match_icase)
101 for (i = 0; i < NCHAR; ++i)
85 trans[i] = TOLOWER(i);
102 trans[i] = TOLOWER (i);
86
103
87 if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0)))
88 fatal("memory exhausted", 0);
104 if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
105 error (2, 0, _("memory exhausted"));
89}
90
91/* If the DFA turns out to have some set of fixed strings one of
92 which must occur in the match, then we build a kwset matcher
93 to find those strings, and thus quickly filter out impossible
94 matches. */
95static void
96kwsmusts (void)
97{
106}
107
108/* If the DFA turns out to have some set of fixed strings one of
109 which must occur in the match, then we build a kwset matcher
110 to find those strings, and thus quickly filter out impossible
111 matches. */
112static void
113kwsmusts (void)
114{
98 struct dfamust *dm;
99 char *err;
115 struct dfamust const *dm;
116 char const *err;
100
101 if (dfa.musts)
102 {
117
118 if (dfa.musts)
119 {
103 kwsinit();
120 kwsinit ();
104 /* First, we compile in the substrings known to be exact
105 matches. The kwset matcher will return the index
106 of the matching string that it chooses. */
107 for (dm = dfa.musts; dm; dm = dm->next)
108 {
109 if (!dm->exact)
110 continue;
121 /* First, we compile in the substrings known to be exact
122 matches. The kwset matcher will return the index
123 of the matching string that it chooses. */
124 for (dm = dfa.musts; dm; dm = dm->next)
125 {
126 if (!dm->exact)
127 continue;
111 ++lastexact;
112 if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
113 fatal(err, 0);
128 ++kwset_exact_matches;
129 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
130 error (2, 0, err);
114 }
115 /* Now, we compile the substrings that will require
116 the use of the regexp matcher. */
117 for (dm = dfa.musts; dm; dm = dm->next)
118 {
119 if (dm->exact)
120 continue;
131 }
132 /* Now, we compile the substrings that will require
133 the use of the regexp matcher. */
134 for (dm = dfa.musts; dm; dm = dm->next)
135 {
136 if (dm->exact)
137 continue;
121 if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
122 fatal(err, 0);
138 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
139 error (2, 0, err);
123 }
140 }
124 if ((err = kwsprep(kwset)) != 0)
125 fatal(err, 0);
141 if ((err = kwsprep (kwset)) != 0)
142 error (2, 0, err);
126 }
127}
128
143 }
144}
145
146#ifdef MBS_SUPPORT
147/* This function allocate the array which correspond to "buf".
148 Then this check multibyte string and mark on the positions which
149 are not singlebyte character nor the first byte of a multibyte
150 character. Caller must free the array. */
151static char*
152check_multibyte_string(char const *buf, size_t size)
153{
154 char *mb_properties = malloc(size);
155 mbstate_t cur_state;
156 int i;
157 memset(&cur_state, 0, sizeof(mbstate_t));
158 memset(mb_properties, 0, sizeof(char)*size);
159 for (i = 0; i < size ;)
160 {
161 size_t mbclen;
162 mbclen = mbrlen(buf + i, size - i, &cur_state);
163
164 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
165 {
166 /* An invalid sequence, or a truncated multibyte character.
167 We treat it as a singlebyte character. */
168 mbclen = 1;
169 }
170 mb_properties[i] = mbclen;
171 i += mbclen;
172 }
173
174 return mb_properties;
175}
176#endif
177
129static void
178static void
130Gcompile (char *pattern, size_t size)
179Gcompile (char const *pattern, size_t size)
131{
132 const char *err;
180{
181 const char *err;
182 char const *sep;
183 size_t total = size;
184 char const *motif = pattern;
133
185
134 re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
135 dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
186 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
187 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
136
188
137 if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
138 fatal(err, 0);
189 /* For GNU regex compiler we have to pass the patterns separately to detect
190 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
191 GNU regex should have raise a syntax error. The same for backref, where
192 the backref should have been local to each pattern. */
193 do
194 {
195 size_t len;
196 sep = memchr (motif, '\n', total);
197 if (sep)
198 {
199 len = sep - motif;
200 sep++;
201 total -= (len + 1);
202 }
203 else
204 {
205 len = total;
206 total = 0;
207 }
139
208
209 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
210 if (patterns == NULL)
211 error (2, errno, _("memory exhausted"));
212
213 patterns[pcount] = patterns0;
214
215 if ((err = re_compile_pattern (motif, len,
216 &(patterns[pcount].regexbuf))) != 0)
217 error (2, 0, err);
218 pcount++;
219
220 motif = sep;
221 } while (sep && total != 0);
222
140 /* In the match_words and match_lines cases, we use a different pattern
141 for the DFA matcher that will quickly throw out cases that won't work.
142 Then if DFA succeeds we do some hairy stuff using the regex matcher
143 to decide whether the match should really count. */
144 if (match_words || match_lines)
145 {
146 /* In the whole-word case, we use the pattern:
223 /* In the match_words and match_lines cases, we use a different pattern
224 for the DFA matcher that will quickly throw out cases that won't work.
225 Then if DFA succeeds we do some hairy stuff using the regex matcher
226 to decide whether the match should really count. */
227 if (match_words || match_lines)
228 {
229 /* In the whole-word case, we use the pattern:
147 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
230 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
148 In the whole-line case, we use the pattern:
231 In the whole-line case, we use the pattern:
149 ^(userpattern)$.
150 BUG: Using [A-Za-z_] is locale-dependent!
151 So will use [:alnum:] */
232 ^\(userpattern\)$. */
152
233
153 char *n = malloc(size + 50);
154 int i = 0;
155
156 strcpy(n, "");
157
158 if (match_lines)
159 strcpy(n, "^\\(");
160 if (match_words)
161 strcpy(n, "\\(^\\|[^[:alnum:]_]\\)\\(");
162
163 i = strlen(n);
164 memcpy(n + i, pattern, size);
234 static char const line_beg[] = "^\\(";
235 static char const line_end[] = "\\)$";
236 static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
237 static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
238 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
239 size_t i;
240 strcpy (n, match_lines ? line_beg : word_beg);
241 i = strlen (n);
242 memcpy (n + i, pattern, size);
165 i += size;
243 i += size;
166
167 if (match_words)
168 strcpy(n + i, "\\)\\([^[:alnum:]_]\\|$\\)");
169 if (match_lines)
170 strcpy(n + i, "\\)$");
171
172 i += strlen(n + i);
173 dfacomp(n, i, &dfa, 1);
244 strcpy (n + i, match_lines ? line_end : word_end);
245 i += strlen (n + i);
246 pattern = n;
247 size = i;
174 }
248 }
175 else
176 dfacomp(pattern, size, &dfa, 1);
177
249
178 kwsmusts();
250 dfacomp (pattern, size, &dfa, 1);
251 kwsmusts ();
179}
180
181static void
252}
253
254static void
182Ecompile (char *pattern, size_t size)
255Ecompile (char const *pattern, size_t size)
183{
184 const char *err;
256{
257 const char *err;
258 const char *sep;
259 size_t total = size;
260 char const *motif = pattern;
185
261
186 if (strcmp(matcher, "awk") == 0)
262 if (strcmp (matcher, "awk") == 0)
187 {
263 {
188 re_set_syntax(RE_SYNTAX_AWK);
189 dfasyntax(RE_SYNTAX_AWK, match_icase, eolbyte);
264 re_set_syntax (RE_SYNTAX_AWK);
265 dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
190 }
191 else
192 {
193 re_set_syntax (RE_SYNTAX_POSIX_EGREP);
194 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
195 }
196
266 }
267 else
268 {
269 re_set_syntax (RE_SYNTAX_POSIX_EGREP);
270 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
271 }
272
197 if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
198 fatal(err, 0);
273 /* For GNU regex compiler we have to pass the patterns separately to detect
274 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
275 GNU regex should have raise a syntax error. The same for backref, where
276 the backref should have been local to each pattern. */
277 do
278 {
279 size_t len;
280 sep = memchr (motif, '\n', total);
281 if (sep)
282 {
283 len = sep - motif;
284 sep++;
285 total -= (len + 1);
286 }
287 else
288 {
289 len = total;
290 total = 0;
291 }
199
292
293 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
294 if (patterns == NULL)
295 error (2, errno, _("memory exhausted"));
296 patterns[pcount] = patterns0;
297
298 if ((err = re_compile_pattern (motif, len,
299 &(patterns[pcount].regexbuf))) != 0)
300 error (2, 0, err);
301 pcount++;
302
303 motif = sep;
304 } while (sep && total != 0);
305
200 /* In the match_words and match_lines cases, we use a different pattern
201 for the DFA matcher that will quickly throw out cases that won't work.
202 Then if DFA succeeds we do some hairy stuff using the regex matcher
203 to decide whether the match should really count. */
204 if (match_words || match_lines)
205 {
206 /* In the whole-word case, we use the pattern:
306 /* In the match_words and match_lines cases, we use a different pattern
307 for the DFA matcher that will quickly throw out cases that won't work.
308 Then if DFA succeeds we do some hairy stuff using the regex matcher
309 to decide whether the match should really count. */
310 if (match_words || match_lines)
311 {
312 /* In the whole-word case, we use the pattern:
207 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
313 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
208 In the whole-line case, we use the pattern:
314 In the whole-line case, we use the pattern:
209 ^(userpattern)$.
210 BUG: Using [A-Za-z_] is locale-dependent!
211 so will use the char class */
315 ^(userpattern)$. */
212
316
213 char *n = malloc(size + 50);
214 int i = 0;
215
216 strcpy(n, "");
217
218 if (match_lines)
219 strcpy(n, "^(");
220 if (match_words)
221 strcpy(n, "(^|[^[:alnum:]_])(");
222
317 static char const line_beg[] = "^(";
318 static char const line_end[] = ")$";
319 static char const word_beg[] = "(^|[^[:alnum:]_])(";
320 static char const word_end[] = ")([^[:alnum:]_]|$)";
321 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
322 size_t i;
323 strcpy (n, match_lines ? line_beg : word_beg);
223 i = strlen(n);
324 i = strlen(n);
224 memcpy(n + i, pattern, size);
325 memcpy (n + i, pattern, size);
225 i += size;
326 i += size;
226
227 if (match_words)
228 strcpy(n + i, ")([^[:alnum:]_]|$)");
229 if (match_lines)
230 strcpy(n + i, ")$");
231
232 i += strlen(n + i);
233 dfacomp(n, i, &dfa, 1);
327 strcpy (n + i, match_lines ? line_end : word_end);
328 i += strlen (n + i);
329 pattern = n;
330 size = i;
234 }
331 }
235 else
236 dfacomp(pattern, size, &dfa, 1);
237
332
238 kwsmusts();
333 dfacomp (pattern, size, &dfa, 1);
334 kwsmusts ();
239}
240
335}
336
241static char *
242EGexecute (char *buf, size_t size, char **endp)
337static size_t
338EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
243{
339{
244 register char *buflim, *beg, *end, save;
340 register char const *buflim, *beg, *end;
245 char eol = eolbyte;
246 int backref, start, len;
247 struct kwsmatch kwsm;
341 char eol = eolbyte;
342 int backref, start, len;
343 struct kwsmatch kwsm;
248 static struct re_registers regs; /* This is static on account of a BRAIN-DEAD
249 Q@#%!# library interface in regex.c. */
344 size_t i;
345#ifdef MBS_SUPPORT
346 char *mb_properties = NULL;
347#endif /* MBS_SUPPORT */
250
348
349#ifdef MBS_SUPPORT
350 if (MB_CUR_MAX > 1 && kwset)
351 mb_properties = check_multibyte_string(buf, size);
352#endif /* MBS_SUPPORT */
353
251 buflim = buf + size;
252
354 buflim = buf + size;
355
253 for (beg = end = buf; end < buflim; beg = end + 1)
356 for (beg = end = buf; end < buflim; beg = end)
254 {
357 {
255 if (kwset)
358 if (!exact)
256 {
359 {
257 /* Find a possible match using the KWset matcher. */
258 beg = kwsexec(kwset, beg, buflim - beg, &kwsm);
259 if (!beg)
260 goto failure;
261 /* Narrow down to the line containing the candidate, and
262 run it through DFA. */
263 end = memchr(beg, eol, buflim - beg);
264 if (!end)
265 end = buflim;
266 while (beg > buf && beg[-1] != eol)
267 --beg;
268 save = *end;
269 if (kwsm.index < lastexact)
270 goto success;
271 if (!dfaexec(&dfa, beg, end, 0, (int *) 0, &backref))
360 if (kwset)
272 {
361 {
273 *end = save;
274 continue;
362 /* Find a possible match using the KWset matcher. */
363 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
364 if (offset == (size_t) -1)
365 {
366#ifdef MBS_SUPPORT
367 if (MB_CUR_MAX > 1)
368 free(mb_properties);
369#endif
370 return (size_t)-1;
371 }
372 beg += offset;
373 /* Narrow down to the line containing the candidate, and
374 run it through DFA. */
375 end = memchr(beg, eol, buflim - beg);
376 end++;
377#ifdef MBS_SUPPORT
378 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
379 continue;
380#endif
381 while (beg > buf && beg[-1] != eol)
382 --beg;
383 if (kwsm.index < kwset_exact_matches)
384 goto success;
385 if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
386 continue;
275 }
387 }
276 *end = save;
277 /* Successful, no backreferences encountered. */
278 if (!backref)
279 goto success;
280 }
281 else
282 {
283 /* No good fixed strings; start with DFA. */
284 save = *buflim;
285 beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref);
286 *buflim = save;
287 if (!beg)
288 goto failure;
289 /* Narrow down to the line we've found. */
290 end = memchr(beg, eol, buflim - beg);
291 if (!end)
292 end = buflim;
293 while (beg > buf && beg[-1] != eol)
294 --beg;
388 else
389 {
390 /* No good fixed strings; start with DFA. */
391 size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
392 if (offset == (size_t) -1)
393 break;
394 /* Narrow down to the line we've found. */
395 beg += offset;
396 end = memchr (beg, eol, buflim - beg);
397 end++;
398 while (beg > buf && beg[-1] != eol)
399 --beg;
400 }
295 /* Successful, no backreferences encountered! */
296 if (!backref)
297 goto success;
298 }
401 /* Successful, no backreferences encountered! */
402 if (!backref)
403 goto success;
404 }
405 else
406 end = beg + size;
407
299 /* If we've made it to this point, this means DFA has seen
300 a probable match, and we need to run it through Regex. */
408 /* If we've made it to this point, this means DFA has seen
409 a probable match, and we need to run it through Regex. */
301 regexbuf.not_eol = 0;
302 if ((start = re_search(&regexbuf, beg, end - beg, 0, end - beg, &regs)) >= 0)
410 for (i = 0; i < pcount; i++)
303 {
411 {
304 len = regs.end[0] - start;
305 if ((!match_lines && !match_words)
306 || (match_lines && len == end - beg))
307 goto success;
308 /* If -w, check if the match aligns with word boundaries.
309 We do this iteratively because:
310 (a) the line may contain more than one occurence of the pattern, and
311 (b) Several alternatives in the pattern might be valid at a given
312 point, and we may need to consider a shorter one to find a word
313 boundary. */
314 if (match_words)
315 while (start >= 0)
316 {
317 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
318 && (len == end - beg
319 || !WCHAR ((unsigned char) beg[start + len])))
320 goto success;
321 if (len > 0)
412 patterns[i].regexbuf.not_eol = 0;
413 if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
414 end - beg - 1, 0,
415 end - beg - 1, &(patterns[i].regs))))
416 {
417 len = patterns[i].regs.end[0] - start;
418 if (exact)
419 {
420 *match_size = len;
421 return start;
422 }
423 if ((!match_lines && !match_words)
424 || (match_lines && len == end - beg - 1))
425 goto success;
426 /* If -w, check if the match aligns with word boundaries.
427 We do this iteratively because:
428 (a) the line may contain more than one occurence of the
429 pattern, and
430 (b) Several alternatives in the pattern might be valid at a
431 given point, and we may need to consider a shorter one to
432 find a word boundary. */
433 if (match_words)
434 while (start >= 0)
322 {
435 {
323 /* Try a shorter length anchored at the same place. */
324 --len;
325 regexbuf.not_eol = 1;
326 len = re_match(&regexbuf, beg, start + len, start, &regs);
436 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
437 && (len == end - beg - 1
438 || !WCHAR ((unsigned char) beg[start + len])))
439 goto success;
440 if (len > 0)
441 {
442 /* Try a shorter length anchored at the same place. */
443 --len;
444 patterns[i].regexbuf.not_eol = 1;
445 len = re_match (&(patterns[i].regexbuf), beg,
446 start + len, start,
447 &(patterns[i].regs));
448 }
449 if (len <= 0)
450 {
451 /* Try looking further on. */
452 if (start == end - beg - 1)
453 break;
454 ++start;
455 patterns[i].regexbuf.not_eol = 0;
456 start = re_search (&(patterns[i].regexbuf), beg,
457 end - beg - 1,
458 start, end - beg - 1 - start,
459 &(patterns[i].regs));
460 len = patterns[i].regs.end[0] - start;
461 }
327 }
462 }
328 if (len <= 0)
329 {
330 /* Try looking further on. */
331 if (start == end - beg)
332 break;
333 ++start;
334 regexbuf.not_eol = 0;
335 start = re_search(&regexbuf, beg, end - beg,
336 start, end - beg - start, &regs);
337 len = regs.end[0] - start;
338 }
339 }
340 }
341 }
463 }
464 } /* for Regex patterns. */
465 } /* for (beg = end ..) */
466#ifdef MBS_SUPPORT
467 if (MB_CUR_MAX > 1 && mb_properties)
468 free (mb_properties);
469#endif /* MBS_SUPPORT */
470 return (size_t) -1;
342
471
343 failure:
344 return 0;
345
346 success:
472 success:
347 *endp = end < buflim ? end + 1 : end;
348 return beg;
473#ifdef MBS_SUPPORT
474 if (MB_CUR_MAX > 1 && mb_properties)
475 free (mb_properties);
476#endif /* MBS_SUPPORT */
477 *match_size = end - beg;
478 return beg - buf;
349}
350
351static void
479}
480
481static void
352Fcompile (char *pattern, size_t size)
482Fcompile (char const *pattern, size_t size)
353{
483{
354 char *beg, *lim, *err;
484 char const *beg, *lim, *err;
355
485
356 kwsinit();
486 kwsinit ();
357 beg = pattern;
358 do
359 {
360 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
361 ;
487 beg = pattern;
488 do
489 {
490 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
491 ;
362 if ((err = kwsincr(kwset, beg, lim - beg)) != 0)
363 fatal(err, 0);
492 if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
493 error (2, 0, err);
364 if (lim < pattern + size)
365 ++lim;
366 beg = lim;
367 }
368 while (beg < pattern + size);
369
494 if (lim < pattern + size)
495 ++lim;
496 beg = lim;
497 }
498 while (beg < pattern + size);
499
370 if ((err = kwsprep(kwset)) != 0)
371 fatal(err, 0);
500 if ((err = kwsprep (kwset)) != 0)
501 error (2, 0, err);
372}
373
502}
503
374static char *
375Fexecute (char *buf, size_t size, char **endp)
504static size_t
505Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
376{
506{
377 register char *beg, *try, *end;
507 register char const *beg, *try, *end;
378 register size_t len;
379 char eol = eolbyte;
380 struct kwsmatch kwsmatch;
508 register size_t len;
509 char eol = eolbyte;
510 struct kwsmatch kwsmatch;
511#ifdef MBS_SUPPORT
512 char *mb_properties;
513 if (MB_CUR_MAX > 1)
514 mb_properties = check_multibyte_string (buf, size);
515#endif /* MBS_SUPPORT */
381
382 for (beg = buf; beg <= buf + size; ++beg)
383 {
516
517 for (beg = buf; beg <= buf + size; ++beg)
518 {
384 if (!(beg = kwsexec(kwset, beg, buf + size - beg, &kwsmatch)))
385 return 0;
519 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
520 if (offset == (size_t) -1)
521 {
522#ifdef MBS_SUPPORT
523 if (MB_CUR_MAX > 1)
524 free(mb_properties);
525#endif /* MBS_SUPPORT */
526 return offset;
527 }
528#ifdef MBS_SUPPORT
529 if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
530 continue; /* It is a part of multibyte character. */
531#endif /* MBS_SUPPORT */
532 beg += offset;
386 len = kwsmatch.size[0];
533 len = kwsmatch.size[0];
534 if (exact)
535 {
536 *match_size = len;
537#ifdef MBS_SUPPORT
538 if (MB_CUR_MAX > 1)
539 free (mb_properties);
540#endif /* MBS_SUPPORT */
541 return beg - buf;
542 }
387 if (match_lines)
388 {
389 if (beg > buf && beg[-1] != eol)
390 continue;
391 if (beg + len < buf + size && beg[len] != eol)
392 continue;
393 goto success;
394 }
395 else if (match_words)
543 if (match_lines)
544 {
545 if (beg > buf && beg[-1] != eol)
546 continue;
547 if (beg + len < buf + size && beg[len] != eol)
548 continue;
549 goto success;
550 }
551 else if (match_words)
396 for (try = beg; len && try;)
552 for (try = beg; len; )
397 {
398 if (try > buf && WCHAR((unsigned char) try[-1]))
399 break;
400 if (try + len < buf + size && WCHAR((unsigned char) try[len]))
401 {
553 {
554 if (try > buf && WCHAR((unsigned char) try[-1]))
555 break;
556 if (try + len < buf + size && WCHAR((unsigned char) try[len]))
557 {
402 try = kwsexec(kwset, beg, --len, &kwsmatch);
558 offset = kwsexec (kwset, beg, --len, &kwsmatch);
559 if (offset == (size_t) -1)
560 {
561#ifdef MBS_SUPPORT
562 if (MB_CUR_MAX > 1)
563 free (mb_properties);
564#endif /* MBS_SUPPORT */
565 return offset;
566 }
567 try = beg + offset;
403 len = kwsmatch.size[0];
404 }
405 else
406 goto success;
407 }
408 else
409 goto success;
410 }
411
568 len = kwsmatch.size[0];
569 }
570 else
571 goto success;
572 }
573 else
574 goto success;
575 }
576
412 return 0;
577#ifdef MBS_SUPPORT
578 if (MB_CUR_MAX > 1)
579 free (mb_properties);
580#endif /* MBS_SUPPORT */
581 return -1;
413
414 success:
582
583 success:
415 if ((end = memchr(beg + len, eol, (buf + size) - (beg + len))) != 0)
416 ++end;
417 else
418 end = buf + size;
419 *endp = end;
420 while (beg > buf && beg[-1] != '\n')
584 end = memchr (beg + len, eol, (buf + size) - (beg + len));
585 end++;
586 while (buf < beg && beg[-1] != eol)
421 --beg;
587 --beg;
422 return beg;
588 *match_size = end - beg;
589#ifdef MBS_SUPPORT
590 if (MB_CUR_MAX > 1)
591 free (mb_properties);
592#endif /* MBS_SUPPORT */
593 return beg - buf;
423}
594}
595
596#if HAVE_LIBPCRE
597/* Compiled internal form of a Perl regular expression. */
598static pcre *cre;
599
600/* Additional information about the pattern. */
601static pcre_extra *extra;
602#endif
603
604static void
605Pcompile (char const *pattern, size_t size)
606{
607#if !HAVE_LIBPCRE
608 error (2, 0, _("The -P option is not supported"));
609#else
610 int e;
611 char const *ep;
612 char *re = xmalloc (4 * size + 7);
613 int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
614 char const *patlim = pattern + size;
615 char *n = re;
616 char const *p;
617 char const *pnul;
618
619 /* FIXME: Remove this restriction. */
620 if (eolbyte != '\n')
621 error (2, 0, _("The -P and -z options cannot be combined"));
622
623 *n = '\0';
624 if (match_lines)
625 strcpy (n, "^(");
626 if (match_words)
627 strcpy (n, "\\b(");
628 n += strlen (n);
629
630 /* The PCRE interface doesn't allow NUL bytes in the pattern, so
631 replace each NUL byte in the pattern with the four characters
632 "\000", removing a preceding backslash if there are an odd
633 number of backslashes before the NUL.
634
635 FIXME: This method does not work with some multibyte character
636 encodings, notably Shift-JIS, where a multibyte character can end
637 in a backslash byte. */
638 for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
639 {
640 memcpy (n, p, pnul - p);
641 n += pnul - p;
642 for (p = pnul; pattern < p && p[-1] == '\\'; p--)
643 continue;
644 n -= (pnul - p) & 1;
645 strcpy (n, "\\000");
646 n += 4;
647 }
648
649 memcpy (n, p, patlim - p);
650 n += patlim - p;
651 *n = '\0';
652 if (match_words)
653 strcpy (n, ")\\b");
654 if (match_lines)
655 strcpy (n, ")$");
656
657 cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
658 if (!cre)
659 error (2, 0, ep);
660
661 extra = pcre_study (cre, 0, &ep);
662 if (ep)
663 error (2, 0, ep);
664
665 free (re);
666#endif
667}
668
669static size_t
670Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
671{
672#if !HAVE_LIBPCRE
673 abort ();
674 return -1;
675#else
676 /* This array must have at least two elements; everything after that
677 is just for performance improvement in pcre_exec. */
678 int sub[300];
679
680 int e = pcre_exec (cre, extra, buf, size, 0, 0,
681 sub, sizeof sub / sizeof *sub);
682
683 if (e <= 0)
684 {
685 switch (e)
686 {
687 case PCRE_ERROR_NOMATCH:
688 return -1;
689
690 case PCRE_ERROR_NOMEMORY:
691 error (2, 0, _("Memory exhausted"));
692
693 default:
694 abort ();
695 }
696 }
697 else
698 {
699 /* Narrow down to the line we've found. */
700 char const *beg = buf + sub[0];
701 char const *end = buf + sub[1];
702 char const *buflim = buf + size;
703 char eol = eolbyte;
704 if (!exact)
705 {
706 end = memchr (end, eol, buflim - end);
707 end++;
708 while (buf < beg && beg[-1] != eol)
709 --beg;
710 }
711
712 *match_size = end - beg;
713 return beg - buf;
714 }
715#endif
716}
717
718struct matcher const matchers[] = {
719 { "default", Gcompile, EGexecute },
720 { "grep", Gcompile, EGexecute },
721 { "egrep", Ecompile, EGexecute },
722 { "awk", Ecompile, EGexecute },
723 { "fgrep", Fcompile, Fexecute },
724 { "perl", Pcompile, Pexecute },
725 { "", 0, 0 },
726};