util.c revision 323443
1/*	$NetBSD: util.c,v 1.9 2011/02/27 17:33:37 joerg Exp $	*/
2/*	$FreeBSD: stable/11/usr.bin/grep/util.c 323443 2017-09-11 15:52:24Z kevans $	*/
3/*	$OpenBSD: util.c,v 1.39 2010/07/02 22:18:03 tedu Exp $	*/
4
5/*-
6 * Copyright (c) 1999 James Howard and Dag-Erling Co��dan Sm��rgrav
7 * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
8 * Copyright (C) 2017 Kyle Evans <kevans@FreeBSD.org>
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: stable/11/usr.bin/grep/util.c 323443 2017-09-11 15:52:24Z kevans $");
35
36#include <sys/stat.h>
37#include <sys/types.h>
38
39#include <ctype.h>
40#include <err.h>
41#include <errno.h>
42#include <fnmatch.h>
43#include <fts.h>
44#include <libgen.h>
45#include <stdbool.h>
46#include <stdio.h>
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
50#include <wchar.h>
51#include <wctype.h>
52
53#ifndef WITHOUT_FASTMATCH
54#include "fastmatch.h"
55#endif
56#include "grep.h"
57
58static bool	 first_match = true;
59
60/*
61 * Parsing context; used to hold things like matches made and
62 * other useful bits
63 */
64struct parsec {
65	regmatch_t	matches[MAX_MATCHES];		/* Matches made */
66	struct str	ln;				/* Current line */
67	size_t		lnstart;			/* Position in line */
68	size_t		matchidx;			/* Latest match index */
69	int		printed;			/* Metadata printed? */
70	bool		binary;				/* Binary file? */
71};
72
73#ifdef WITH_INTERNAL_NOSPEC
74static int litexec(const struct pat *pat, const char *string,
75    size_t nmatch, regmatch_t pmatch[]);
76#endif
77static int procline(struct parsec *pc);
78static void printline(struct parsec *pc, int sep);
79static void printline_metadata(struct str *line, int sep);
80
81bool
82file_matching(const char *fname)
83{
84	char *fname_base, *fname_buf;
85	bool ret;
86
87	ret = finclude ? false : true;
88	fname_buf = strdup(fname);
89	if (fname_buf == NULL)
90		err(2, "strdup");
91	fname_base = basename(fname_buf);
92
93	for (unsigned int i = 0; i < fpatterns; ++i) {
94		if (fnmatch(fpattern[i].pat, fname, 0) == 0 ||
95		    fnmatch(fpattern[i].pat, fname_base, 0) == 0) {
96			if (fpattern[i].mode == EXCL_PAT) {
97				ret = false;
98				break;
99			} else
100				ret = true;
101		}
102	}
103	free(fname_buf);
104	return (ret);
105}
106
107static inline bool
108dir_matching(const char *dname)
109{
110	bool ret;
111
112	ret = dinclude ? false : true;
113
114	for (unsigned int i = 0; i < dpatterns; ++i) {
115		if (dname != NULL &&
116		    fnmatch(dpattern[i].pat, dname, 0) == 0) {
117			if (dpattern[i].mode == EXCL_PAT)
118				return (false);
119			else
120				ret = true;
121		}
122	}
123	return (ret);
124}
125
126/*
127 * Processes a directory when a recursive search is performed with
128 * the -R option.  Each appropriate file is passed to procfile().
129 */
130int
131grep_tree(char **argv)
132{
133	FTS *fts;
134	FTSENT *p;
135	int c, fts_flags;
136	bool ok;
137	const char *wd[] = { ".", NULL };
138
139	c = fts_flags = 0;
140
141	switch(linkbehave) {
142	case LINK_EXPLICIT:
143		fts_flags = FTS_COMFOLLOW;
144		break;
145	case LINK_SKIP:
146		fts_flags = FTS_PHYSICAL;
147		break;
148	default:
149		fts_flags = FTS_LOGICAL;
150
151	}
152
153	fts_flags |= FTS_NOSTAT | FTS_NOCHDIR;
154
155	fts = fts_open((argv[0] == NULL) ?
156	    __DECONST(char * const *, wd) : argv, fts_flags, NULL);
157	if (fts == NULL)
158		err(2, "fts_open");
159	while ((p = fts_read(fts)) != NULL) {
160		switch (p->fts_info) {
161		case FTS_DNR:
162			/* FALLTHROUGH */
163		case FTS_ERR:
164			file_err = true;
165			if(!sflag)
166				warnx("%s: %s", p->fts_path, strerror(p->fts_errno));
167			break;
168		case FTS_D:
169			/* FALLTHROUGH */
170		case FTS_DP:
171			if (dexclude || dinclude)
172				if (!dir_matching(p->fts_name) ||
173				    !dir_matching(p->fts_path))
174					fts_set(fts, p, FTS_SKIP);
175			break;
176		case FTS_DC:
177			/* Print a warning for recursive directory loop */
178			warnx("warning: %s: recursive directory loop",
179				p->fts_path);
180			break;
181		default:
182			/* Check for file exclusion/inclusion */
183			ok = true;
184			if (fexclude || finclude)
185				ok &= file_matching(p->fts_path);
186
187			if (ok)
188				c += procfile(p->fts_path);
189			break;
190		}
191	}
192
193	fts_close(fts);
194	return (c);
195}
196
197/*
198 * Opens a file and processes it.  Each file is processed line-by-line
199 * passing the lines to procline().
200 */
201int
202procfile(const char *fn)
203{
204	struct parsec pc;
205	long long tail;
206	struct file *f;
207	struct stat sb;
208	struct str *ln;
209	mode_t s;
210	int c, last_outed, t;
211	bool doctx, printmatch, same_file;
212
213	if (strcmp(fn, "-") == 0) {
214		fn = label != NULL ? label : getstr(1);
215		f = grep_open(NULL);
216	} else {
217		if (!stat(fn, &sb)) {
218			/* Check if we need to process the file */
219			s = sb.st_mode & S_IFMT;
220			if (s == S_IFDIR && dirbehave == DIR_SKIP)
221				return (0);
222			if ((s == S_IFIFO || s == S_IFCHR || s == S_IFBLK
223				|| s == S_IFSOCK) && devbehave == DEV_SKIP)
224					return (0);
225		}
226		f = grep_open(fn);
227	}
228	if (f == NULL) {
229		file_err = true;
230		if (!sflag)
231			warn("%s", fn);
232		return (0);
233	}
234
235	/* Convenience */
236	ln = &pc.ln;
237	pc.ln.file = grep_malloc(strlen(fn) + 1);
238	strcpy(pc.ln.file, fn);
239	pc.ln.line_no = 0;
240	pc.ln.len = 0;
241	pc.ln.boff = 0;
242	pc.ln.off = -1;
243	pc.binary = f->binary;
244	pc.printed = 0;
245	tail = 0;
246	last_outed = 0;
247	same_file = false;
248	doctx = false;
249	printmatch = true;
250	if ((pc.binary && binbehave == BINFILE_BIN) || cflag || qflag ||
251	    lflag || Lflag)
252		printmatch = false;
253	if (printmatch && (Aflag != 0 || Bflag != 0))
254		doctx = true;
255	mcount = mlimit;
256
257	for (c = 0;  c == 0 || !(lflag || qflag); ) {
258		/* Reset per-line statistics */
259		pc.printed = 0;
260		pc.matchidx = 0;
261		pc.lnstart = 0;
262		pc.ln.boff = 0;
263		pc.ln.off += pc.ln.len + 1;
264		if ((pc.ln.dat = grep_fgetln(f, &pc.ln.len)) == NULL ||
265		    pc.ln.len == 0)
266			break;
267
268		if (pc.ln.len > 0 && pc.ln.dat[pc.ln.len - 1] == fileeol)
269			--pc.ln.len;
270		pc.ln.line_no++;
271
272		/* Return if we need to skip a binary file */
273		if (pc.binary && binbehave == BINFILE_SKIP) {
274			grep_close(f);
275			free(pc.ln.file);
276			free(f);
277			return (0);
278		}
279
280		if ((t = procline(&pc)) == 0)
281			++c;
282
283		/* Deal with any -B context or context separators */
284		if (t == 0 && doctx) {
285			if (!first_match && (!same_file || last_outed > 0))
286				printf("--\n");
287			if (Bflag > 0)
288				printqueue();
289			tail = Aflag;
290		}
291		/* Print the matching line, but only if not quiet/binary */
292		if (t == 0 && printmatch) {
293			printline(&pc, ':');
294			while (pc.matchidx >= MAX_MATCHES) {
295				/* Reset matchidx and try again */
296				pc.matchidx = 0;
297				if (procline(&pc) == 0)
298					printline(&pc, ':');
299				else
300					break;
301			}
302			first_match = false;
303			same_file = true;
304			last_outed = 0;
305		}
306		if (t != 0 && doctx) {
307			/* Deal with any -A context */
308			if (tail > 0) {
309				grep_printline(&pc.ln, '-');
310				tail--;
311				if (Bflag > 0)
312					clearqueue();
313			} else {
314				/*
315				 * Enqueue non-matching lines for -B context.
316				 * If we're not actually doing -B context or if
317				 * the enqueue resulted in a line being rotated
318				 * out, then go ahead and increment last_outed
319				 * to signify a gap between context/match.
320				 */
321				if (Bflag == 0 || (Bflag > 0 && enqueue(ln)))
322					++last_outed;
323			}
324		}
325
326		/* Count the matches if we have a match limit */
327		if (t == 0 && mflag) {
328			--mcount;
329			if (mflag && mcount <= 0)
330				break;
331		}
332
333	}
334	if (Bflag > 0)
335		clearqueue();
336	grep_close(f);
337
338	if (cflag) {
339		if (!hflag)
340			printf("%s:", pc.ln.file);
341		printf("%u\n", c);
342	}
343	if (lflag && !qflag && c != 0)
344		printf("%s%c", fn, nullflag ? 0 : '\n');
345	if (Lflag && !qflag && c == 0)
346		printf("%s%c", fn, nullflag ? 0 : '\n');
347	if (c && !cflag && !lflag && !Lflag &&
348	    binbehave == BINFILE_BIN && f->binary && !qflag)
349		printf(getstr(8), fn);
350
351	free(pc.ln.file);
352	free(f);
353	return (c);
354}
355
356#ifdef WITH_INTERNAL_NOSPEC
357/*
358 * Internal implementation of literal string search within a string, modeled
359 * after regexec(3), for use when the regex(3) implementation doesn't offer
360 * either REG_NOSPEC or REG_LITERAL. This does not apply in the default FreeBSD
361 * config, but in other scenarios such as building against libgnuregex or on
362 * some non-FreeBSD OSes.
363 */
364static int
365litexec(const struct pat *pat, const char *string, size_t nmatch,
366    regmatch_t pmatch[])
367{
368	char *(*strstr_fn)(const char *, const char *);
369	char *sub, *subject;
370	const char *search;
371	size_t idx, n, ofs, stringlen;
372
373	if (cflags & REG_ICASE)
374		strstr_fn = strcasestr;
375	else
376		strstr_fn = strstr;
377	idx = 0;
378	ofs = pmatch[0].rm_so;
379	stringlen = pmatch[0].rm_eo;
380	if (ofs >= stringlen)
381		return (REG_NOMATCH);
382	subject = strndup(string, stringlen);
383	if (subject == NULL)
384		return (REG_ESPACE);
385	for (n = 0; ofs < stringlen;) {
386		search = (subject + ofs);
387		if ((unsigned long)pat->len > strlen(search))
388			break;
389		sub = strstr_fn(search, pat->pat);
390		/*
391		 * Ignoring the empty string possibility due to context: grep optimizes
392		 * for empty patterns and will never reach this point.
393		 */
394		if (sub == NULL)
395			break;
396		++n;
397		/* Fill in pmatch if necessary */
398		if (nmatch > 0) {
399			pmatch[idx].rm_so = ofs + (sub - search);
400			pmatch[idx].rm_eo = pmatch[idx].rm_so + pat->len;
401			if (++idx == nmatch)
402				break;
403			ofs = pmatch[idx].rm_so + 1;
404		} else
405			/* We only needed to know if we match or not */
406			break;
407	}
408	free(subject);
409	if (n > 0 && nmatch > 0)
410		for (n = idx; n < nmatch; ++n)
411			pmatch[n].rm_so = pmatch[n].rm_eo = -1;
412
413	return (n > 0 ? 0 : REG_NOMATCH);
414}
415#endif /* WITH_INTERNAL_NOSPEC */
416
417#define iswword(x)	(iswalnum((x)) || (x) == L'_')
418
419/*
420 * Processes a line comparing it with the specified patterns.  Each pattern
421 * is looped to be compared along with the full string, saving each and every
422 * match, which is necessary to colorize the output and to count the
423 * matches.  The matching lines are passed to printline() to display the
424 * appropriate output.
425 */
426static int
427procline(struct parsec *pc)
428{
429	regmatch_t pmatch, lastmatch, chkmatch;
430	wchar_t wbegin, wend;
431	size_t st, nst;
432	unsigned int i;
433	int c = 0, r = 0, lastmatches = 0, leflags = eflags;
434	size_t startm = 0, matchidx;
435	unsigned int retry;
436
437	matchidx = pc->matchidx;
438
439	/* Special case: empty pattern with -w flag, check first character */
440	if (matchall && wflag) {
441		if (pc->ln.len == 0)
442			return (0);
443		wend = L' ';
444		if (sscanf(&pc->ln.dat[0], "%lc", &wend) != 1 || iswword(wend))
445			return (1);
446		else
447			return (0);
448	} else if (matchall)
449		return (0);
450
451	st = pc->lnstart;
452	nst = 0;
453	/* Initialize to avoid a false positive warning from GCC. */
454	lastmatch.rm_so = lastmatch.rm_eo = 0;
455
456	/* Loop to process the whole line */
457	while (st <= pc->ln.len) {
458		lastmatches = 0;
459		startm = matchidx;
460		retry = 0;
461		if (st > 0 && pc->ln.dat[st - 1] != fileeol)
462			leflags |= REG_NOTBOL;
463		/* Loop to compare with all the patterns */
464		for (i = 0; i < patterns; i++) {
465			pmatch.rm_so = st;
466			pmatch.rm_eo = pc->ln.len;
467#ifdef WITH_INTERNAL_NOSPEC
468			if (grepbehave == GREP_FIXED)
469				r = litexec(&pattern[i], pc->ln.dat, 1, &pmatch);
470			else
471#endif
472#ifndef WITHOUT_FASTMATCH
473			if (fg_pattern[i].pattern)
474				r = fastexec(&fg_pattern[i],
475				    pc->ln.dat, 1, &pmatch, leflags);
476			else
477#endif
478				r = regexec(&r_pattern[i], pc->ln.dat, 1,
479				    &pmatch, leflags);
480			if (r != 0)
481				continue;
482			/* Check for full match */
483			if (xflag && (pmatch.rm_so != 0 ||
484			    (size_t)pmatch.rm_eo != pc->ln.len))
485				continue;
486			/* Check for whole word match */
487#ifndef WITHOUT_FASTMATCH
488			if (wflag || fg_pattern[i].word) {
489#else
490			if (wflag) {
491#endif
492				wbegin = wend = L' ';
493				if (pmatch.rm_so != 0 &&
494				    sscanf(&pc->ln.dat[pmatch.rm_so - 1],
495				    "%lc", &wbegin) != 1)
496					r = REG_NOMATCH;
497				else if ((size_t)pmatch.rm_eo !=
498				    pc->ln.len &&
499				    sscanf(&pc->ln.dat[pmatch.rm_eo],
500				    "%lc", &wend) != 1)
501					r = REG_NOMATCH;
502				else if (iswword(wbegin) ||
503				    iswword(wend))
504					r = REG_NOMATCH;
505				/*
506				 * If we're doing whole word matching and we
507				 * matched once, then we should try the pattern
508				 * again after advancing just past the start of
509				 * the earliest match. This allows the pattern
510				 * to  match later on in the line and possibly
511				 * still match a whole word.
512				 */
513				if (r == REG_NOMATCH &&
514				    (retry == pc->lnstart ||
515				    (unsigned int)pmatch.rm_so + 1 < retry))
516					retry = pmatch.rm_so + 1;
517				if (r == REG_NOMATCH)
518					continue;
519			}
520			lastmatches++;
521			lastmatch = pmatch;
522
523			if (matchidx == 0)
524				c++;
525
526			/*
527			 * Replace previous match if the new one is earlier
528			 * and/or longer. This will lead to some amount of
529			 * extra work if -o/--color are specified, but it's
530			 * worth it from a correctness point of view.
531			 */
532			if (matchidx > startm) {
533				chkmatch = pc->matches[matchidx - 1];
534				if (pmatch.rm_so < chkmatch.rm_so ||
535				    (pmatch.rm_so == chkmatch.rm_so &&
536				    (pmatch.rm_eo - pmatch.rm_so) >
537				    (chkmatch.rm_eo - chkmatch.rm_so))) {
538					pc->matches[matchidx - 1] = pmatch;
539					nst = pmatch.rm_eo;
540				}
541			} else {
542				/* Advance as normal if not */
543				pc->matches[matchidx++] = pmatch;
544				nst = pmatch.rm_eo;
545			}
546			/* avoid excessive matching - skip further patterns */
547			if ((color == NULL && !oflag) || qflag || lflag ||
548			    matchidx >= MAX_MATCHES) {
549				pc->lnstart = nst;
550				lastmatches = 0;
551				break;
552			}
553		}
554
555		/*
556		 * Advance to just past the start of the earliest match, try
557		 * again just in case we still have a chance to match later in
558		 * the string.
559		 */
560		if (lastmatches == 0 && retry > pc->lnstart) {
561			st = retry;
562			continue;
563		}
564
565		/* One pass if we are not recording matches */
566		if (!wflag && ((color == NULL && !oflag) || qflag || lflag || Lflag))
567			break;
568
569		/* If we didn't have any matches or REG_NOSUB set */
570		if (lastmatches == 0 || (cflags & REG_NOSUB))
571			nst = pc->ln.len;
572
573		if (lastmatches == 0)
574			/* No matches */
575			break;
576		else if (st == nst && lastmatch.rm_so == lastmatch.rm_eo)
577			/* Zero-length match -- advance one more so we don't get stuck */
578			nst++;
579
580		/* Advance st based on previous matches */
581		st = nst;
582		pc->lnstart = st;
583	}
584
585	/* Reflect the new matchidx in the context */
586	pc->matchidx = matchidx;
587	if (vflag)
588		c = !c;
589	return (c ? 0 : 1);
590}
591
592/*
593 * Safe malloc() for internal use.
594 */
595void *
596grep_malloc(size_t size)
597{
598	void *ptr;
599
600	if ((ptr = malloc(size)) == NULL)
601		err(2, "malloc");
602	return (ptr);
603}
604
605/*
606 * Safe calloc() for internal use.
607 */
608void *
609grep_calloc(size_t nmemb, size_t size)
610{
611	void *ptr;
612
613	if ((ptr = calloc(nmemb, size)) == NULL)
614		err(2, "calloc");
615	return (ptr);
616}
617
618/*
619 * Safe realloc() for internal use.
620 */
621void *
622grep_realloc(void *ptr, size_t size)
623{
624
625	if ((ptr = realloc(ptr, size)) == NULL)
626		err(2, "realloc");
627	return (ptr);
628}
629
630/*
631 * Safe strdup() for internal use.
632 */
633char *
634grep_strdup(const char *str)
635{
636	char *ret;
637
638	if ((ret = strdup(str)) == NULL)
639		err(2, "strdup");
640	return (ret);
641}
642
643/*
644 * Print an entire line as-is, there are no inline matches to consider. This is
645 * used for printing context.
646 */
647void grep_printline(struct str *line, int sep) {
648	printline_metadata(line, sep);
649	fwrite(line->dat, line->len, 1, stdout);
650	putchar(fileeol);
651}
652
653static void
654printline_metadata(struct str *line, int sep)
655{
656	bool printsep;
657
658	printsep = false;
659	if (!hflag) {
660		if (!nullflag) {
661			fputs(line->file, stdout);
662			printsep = true;
663		} else {
664			printf("%s", line->file);
665			putchar(0);
666		}
667	}
668	if (nflag) {
669		if (printsep)
670			putchar(sep);
671		printf("%d", line->line_no);
672		printsep = true;
673	}
674	if (bflag) {
675		if (printsep)
676			putchar(sep);
677		printf("%lld", (long long)(line->off + line->boff));
678		printsep = true;
679	}
680	if (printsep)
681		putchar(sep);
682}
683
684/*
685 * Prints a matching line according to the command line options.
686 */
687static void
688printline(struct parsec *pc, int sep)
689{
690	size_t a = 0;
691	size_t i, matchidx;
692	regmatch_t match;
693
694	/* If matchall, everything matches but don't actually print for -o */
695	if (oflag && matchall)
696		return;
697
698	matchidx = pc->matchidx;
699
700	/* --color and -o */
701	if ((oflag || color) && matchidx > 0) {
702		/* Only print metadata once per line if --color */
703		if (!oflag && pc->printed == 0)
704			printline_metadata(&pc->ln, sep);
705		for (i = 0; i < matchidx; i++) {
706			match = pc->matches[i];
707			/* Don't output zero length matches */
708			if (match.rm_so == match.rm_eo)
709				continue;
710			/*
711			 * Metadata is printed on a per-line basis, so every
712			 * match gets file metadata with the -o flag.
713			 */
714			if (oflag) {
715				pc->ln.boff = match.rm_so;
716				printline_metadata(&pc->ln, sep);
717			} else
718				fwrite(pc->ln.dat + a, match.rm_so - a, 1,
719				    stdout);
720			if (color)
721				fprintf(stdout, "\33[%sm\33[K", color);
722			fwrite(pc->ln.dat + match.rm_so,
723			    match.rm_eo - match.rm_so, 1, stdout);
724			if (color)
725				fprintf(stdout, "\33[m\33[K");
726			a = match.rm_eo;
727			if (oflag)
728				putchar('\n');
729		}
730		if (!oflag) {
731			if (pc->ln.len - a > 0)
732				fwrite(pc->ln.dat + a, pc->ln.len - a, 1,
733				    stdout);
734			putchar('\n');
735		}
736	} else
737		grep_printline(&pc->ln, sep);
738	pc->printed++;
739}
740