1/*	$OpenBSD: util.c,v 1.36 2007/10/02 17:59:18 otto Exp $	*/
2/*	$FreeBSD: head/usr.bin/grep/fastgrep.c 211496 2010-08-19 09:28:59Z des $ */
3
4/*-
5 * Copyright (c) 1999 James Howard and Dag-Erling Co��dan Sm��rgrav
6 * Copyright (C) 2008 Gabor Kovesdan <gabor@FreeBSD.org>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31/*
32 * XXX: This file is a speed up for grep to cover the defects of the
33 * regex library.  These optimizations should practically be implemented
34 * there keeping this code clean.  This is a future TODO, but for the
35 * meantime, we need to use this workaround.
36 */
37
38#if HAVE_NBTOOL_CONFIG_H
39#include "nbtool_config.h"
40#endif
41
42#include <sys/cdefs.h>
43__RCSID("$NetBSD: fastgrep.c,v 1.4 2011/02/27 17:33:37 joerg Exp $");
44
45#include <limits.h>
46#include <stdbool.h>
47#include <stdlib.h>
48#include <string.h>
49#include <wchar.h>
50#include <wctype.h>
51
52#include "grep.h"
53
54static inline int	grep_cmp(const unsigned char *, const unsigned char *, size_t);
55static inline void	grep_revstr(unsigned char *, int);
56
57void
58fgrepcomp(fastgrep_t *fg, const char *pat)
59{
60	unsigned int i;
61
62	/* Initialize. */
63	fg->len = strlen(pat);
64	fg->bol = false;
65	fg->eol = false;
66	fg->reversed = false;
67
68	fg->pattern = (unsigned char *)grep_strdup(pat);
69
70	/* Preprocess pattern. */
71	for (i = 0; i <= UCHAR_MAX; i++)
72		fg->qsBc[i] = fg->len;
73	for (i = 1; i < fg->len; i++)
74		fg->qsBc[fg->pattern[i]] = fg->len - i;
75}
76
77/*
78 * Returns: -1 on failure, 0 on success
79 */
80int
81fastcomp(fastgrep_t *fg, const char *pat)
82{
83	unsigned int i;
84	int firstHalfDot = -1;
85	int firstLastHalfDot = -1;
86	int hasDot = 0;
87	int lastHalfDot = 0;
88	int shiftPatternLen;
89
90	/* Initialize. */
91	fg->len = strlen(pat);
92	fg->bol = false;
93	fg->eol = false;
94	fg->reversed = false;
95	fg->word = wflag;
96
97	/* Remove end-of-line character ('$'). */
98	if (fg->len > 0 && pat[fg->len - 1] == '$') {
99		fg->eol = true;
100		fg->len--;
101	}
102
103	/* Remove beginning-of-line character ('^'). */
104	if (pat[0] == '^') {
105		fg->bol = true;
106		fg->len--;
107		pat++;
108	}
109
110	if (fg->len >= 14 &&
111	    memcmp(pat, "[[:<:]]", 7) == 0 &&
112	    memcmp(pat + fg->len - 7, "[[:>:]]", 7) == 0) {
113		fg->len -= 14;
114		pat += 7;
115		/* Word boundary is handled separately in util.c */
116		fg->word = true;
117	}
118
119	/*
120	 * pat has been adjusted earlier to not include '^', '$' or
121	 * the word match character classes at the beginning and ending
122	 * of the string respectively.
123	 */
124	fg->pattern = grep_malloc(fg->len + 1);
125	memcpy(fg->pattern, pat, fg->len);
126	fg->pattern[fg->len] = '\0';
127
128	/* Look for ways to cheat...er...avoid the full regex engine. */
129	for (i = 0; i < fg->len; i++) {
130		/* Can still cheat? */
131		if (fg->pattern[i] == '.') {
132			hasDot = i;
133			if (i < fg->len / 2) {
134				if (firstHalfDot < 0)
135					/* Closest dot to the beginning */
136					firstHalfDot = i;
137			} else {
138				/* Closest dot to the end of the pattern. */
139				lastHalfDot = i;
140				if (firstLastHalfDot < 0)
141					firstLastHalfDot = i;
142			}
143		} else {
144			/* Free memory and let others know this is empty. */
145			free(fg->pattern);
146			fg->pattern = NULL;
147			return (-1);
148		}
149	}
150
151	/*
152	 * Determine if a reverse search would be faster based on the placement
153	 * of the dots.
154	 */
155	if ((!(lflag || cflag)) && ((!(fg->bol || fg->eol)) &&
156	    ((lastHalfDot) && ((firstHalfDot < 0) ||
157	    ((fg->len - (lastHalfDot + 1)) < (size_t)firstHalfDot)))) &&
158	    !oflag && !color) {
159		fg->reversed = true;
160		hasDot = fg->len - (firstHalfDot < 0 ?
161		    firstLastHalfDot : firstHalfDot) - 1;
162		grep_revstr(fg->pattern, fg->len);
163	}
164
165	/*
166	 * Normal Quick Search would require a shift based on the position the
167	 * next character after the comparison is within the pattern.  With
168	 * wildcards, the position of the last dot effects the maximum shift
169	 * distance.
170	 * The closer to the end the wild card is the slower the search.  A
171	 * reverse version of this algorithm would be useful for wildcards near
172	 * the end of the string.
173	 *
174	 * Examples:
175	 * Pattern	Max shift
176	 * -------	---------
177	 * this		5
178	 * .his		4
179	 * t.is		3
180	 * th.s		2
181	 * thi.		1
182	 */
183
184	/* Adjust the shift based on location of the last dot ('.'). */
185	shiftPatternLen = fg->len - hasDot;
186
187	/* Preprocess pattern. */
188	for (i = 0; i <= (signed)UCHAR_MAX; i++)
189		fg->qsBc[i] = shiftPatternLen;
190	for (i = hasDot + 1; i < fg->len; i++) {
191		fg->qsBc[fg->pattern[i]] = fg->len - i;
192	}
193
194	/*
195	 * Put pattern back to normal after pre-processing to allow for easy
196	 * comparisons later.
197	 */
198	if (fg->reversed)
199		grep_revstr(fg->pattern, fg->len);
200
201	return (0);
202}
203
204int
205grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch)
206{
207	unsigned int j;
208	int ret = REG_NOMATCH;
209
210	if (pmatch->rm_so == (ssize_t)len)
211		return (ret);
212
213	if (fg->bol && pmatch->rm_so != 0) {
214		pmatch->rm_so = len;
215		pmatch->rm_eo = len;
216		return (ret);
217	}
218
219	/* No point in going farther if we do not have enough data. */
220	if (len < fg->len)
221		return (ret);
222
223	/* Only try once at the beginning or ending of the line. */
224	if (fg->bol || fg->eol) {
225		/* Simple text comparison. */
226		/* Verify data is >= pattern length before searching on it. */
227		if (len >= fg->len) {
228			/* Determine where in data to start search at. */
229			j = fg->eol ? len - fg->len : 0;
230			if (!((fg->bol && fg->eol) && (len != fg->len)))
231				if (grep_cmp(fg->pattern, data + j,
232				    fg->len) == -1) {
233					pmatch->rm_so = j;
234					pmatch->rm_eo = j + fg->len;
235						ret = 0;
236				}
237		}
238	} else if (fg->reversed) {
239		/* Quick Search algorithm. */
240		j = len;
241		do {
242			if (grep_cmp(fg->pattern, data + j - fg->len,
243				fg->len) == -1) {
244				pmatch->rm_so = j - fg->len;
245				pmatch->rm_eo = j;
246				ret = 0;
247				break;
248			}
249			/* Shift if within bounds, otherwise, we are done. */
250			if (j == fg->len)
251				break;
252			j -= fg->qsBc[data[j - fg->len - 1]];
253		} while (j >= fg->len);
254	} else {
255		/* Quick Search algorithm. */
256		j = pmatch->rm_so;
257		do {
258			if (grep_cmp(fg->pattern, data + j, fg->len) == -1) {
259				pmatch->rm_so = j;
260				pmatch->rm_eo = j + fg->len;
261				ret = 0;
262				break;
263			}
264
265			/* Shift if within bounds, otherwise, we are done. */
266			if (j + fg->len == len)
267				break;
268			else
269				j += fg->qsBc[data[j + fg->len]];
270		} while (j <= (len - fg->len));
271	}
272
273	return (ret);
274}
275
276/*
277 * Returns:	i >= 0 on failure (position that it failed)
278 *		-1 on success
279 */
280static inline int
281grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len)
282{
283	size_t size;
284	wchar_t *wdata, *wpat;
285	unsigned int i;
286
287	if (iflag) {
288		if ((size = mbstowcs(NULL, (const char *)data, 0)) ==
289		    ((size_t) - 1))
290			return (-1);
291
292		wdata = grep_malloc(size * sizeof(wint_t));
293
294		if (mbstowcs(wdata, (const char *)data, size) ==
295		    ((size_t) - 1))
296			return (-1);
297
298		if ((size = mbstowcs(NULL, (const char *)pat, 0)) ==
299		    ((size_t) - 1))
300			return (-1);
301
302		wpat = grep_malloc(size * sizeof(wint_t));
303
304		if (mbstowcs(wpat, (const char *)pat, size) == ((size_t) - 1))
305			return (-1);
306		for (i = 0; i < len; i++) {
307			if ((towlower(wpat[i]) == towlower(wdata[i])) ||
308			    ((grepbehave != GREP_FIXED) && wpat[i] == L'.'))
309				continue;
310			free(wpat);
311			free(wdata);
312				return (i);
313		}
314	} else {
315		for (i = 0; i < len; i++) {
316			if ((pat[i] == data[i]) || ((grepbehave != GREP_FIXED) &&
317			    pat[i] == '.'))
318				continue;
319			return (i);
320		}
321	}
322	return (-1);
323}
324
325static inline void
326grep_revstr(unsigned char *str, int len)
327{
328	int i;
329	char c;
330
331	for (i = 0; i < len / 2; i++) {
332		c = str[i];
333		str[i] = str[len - i - 1];
334		str[len - i - 1] = c;
335	}
336}
337