1/*	$NetBSD: file.c,v 1.5 2011/02/16 18:35:39 joerg Exp $	*/
2/*	$FreeBSD: stable/11/usr.bin/grep/file.c 354628 2019-11-11 19:54:08Z kevans $	*/
3/*	$OpenBSD: file.c,v 1.11 2010/07/02 20:48:48 nicm Exp $	*/
4
5/*-
6 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
7 *
8 * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
9 * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
10 * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
11 * All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: stable/11/usr.bin/grep/file.c 354628 2019-11-11 19:54:08Z kevans $");
37
38#include <sys/param.h>
39#include <sys/mman.h>
40#include <sys/stat.h>
41#include <sys/types.h>
42
43#include <err.h>
44#include <errno.h>
45#include <fcntl.h>
46#include <stddef.h>
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
50#include <wchar.h>
51#include <wctype.h>
52#include <zlib.h>
53
54#ifndef WITHOUT_LZMA
55#include <lzma.h>
56#endif
57
58#ifndef WITHOUT_BZIP2
59#include <bzlib.h>
60#endif
61
62#include "grep.h"
63
/* Size of the malloc'ed read buffer and of the LZMA input staging buffer. */
#define	MAXBUFSIZ	(32 * 1024)
/* Extra room requested beyond the current line length when growing lnbuf. */
#define	LNBUFBUMP	80

/* zlib handle for the current file; assigned from gzdopen() in grep_open(). */
static gzFile gzbufdesc;
#ifndef WITHOUT_LZMA
/*
 * liblzma decoder state, the action handed to lzma_code(), and the buffer
 * that holds still-compressed bytes read from the file descriptor.
 */
static lzma_stream lstrm = LZMA_STREAM_INIT;
static lzma_action laction;
static uint8_t lin_buf[MAXBUFSIZ];
#endif
#ifndef WITHOUT_BZIP2
/* bzip2 handle for the current file; assigned from BZ2_bzdopen(). */
static BZFILE* bzbufdesc;
#endif

/*
 * Read buffer state shared by all functions in this file:
 *   buffer - start of the read buffer (malloc'ed, or the mmap'ed file)
 *   bufpos - next unconsumed byte inside buffer
 *   bufrem - number of unconsumed bytes starting at bufpos
 *   fsiz   - length of the mapping, kept for munmap() in grep_close()
 */
static unsigned char *buffer;
static unsigned char *bufpos;
static size_t bufrem;
static size_t fsiz;

/*
 * Line buffer used by grep_fgetln() when a line does not lie entirely in
 * the read buffer, and its current capacity in bytes.
 */
static unsigned char *lnbuf;
static size_t lnbuflen;
84
85static inline int
86grep_refill(struct file *f)
87{
88	ssize_t nr;
89#ifndef WITHOUT_LZMA
90	lzma_ret lzmaret;
91#endif
92
93	if (filebehave == FILE_MMAP)
94		return (0);
95
96	bufpos = buffer;
97	bufrem = 0;
98
99	switch (filebehave) {
100	case FILE_GZIP:
101		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
102		break;
103#ifndef WITHOUT_BZIP2
104	case FILE_BZIP:
105		if (bzbufdesc != NULL) {
106			int bzerr;
107
108			nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
109			switch (bzerr) {
110			case BZ_OK:
111			case BZ_STREAM_END:
112				/* No problem, nr will be okay */
113				break;
114			case BZ_DATA_ERROR_MAGIC:
115				/*
116				 * As opposed to gzread(), which simply returns the
117				 * plain file data, if it is not in the correct
118				 * compressed format, BZ2_bzRead() instead aborts.
119				 *
120				 * So, just restart at the beginning of the file again,
121				 * and use plain reads from now on.
122				 */
123				BZ2_bzReadClose(&bzerr, bzbufdesc);
124				bzbufdesc = NULL;
125				if (lseek(f->fd, 0, SEEK_SET) == -1)
126					return (-1);
127				nr = read(f->fd, buffer, MAXBUFSIZ);
128				break;
129			default:
130				/* Make sure we exit with an error */
131				nr = -1;
132			}
133		} else
134			/*
135			 * Also an error case; we should never have a scenario
136			 * where we have an open file but no bzip descriptor
137			 * at this point. See: grep_open
138			 */
139			nr = -1;
140		break;
141#endif
142#ifndef WITHOUT_LZMA
143	case FILE_XZ:
144	case FILE_LZMA:
145		lstrm.next_out = buffer;
146
147		do {
148			if (lstrm.avail_in == 0) {
149				lstrm.next_in = lin_buf;
150				nr = read(f->fd, lin_buf, MAXBUFSIZ);
151
152				if (nr < 0)
153					return (-1);
154				else if (nr == 0)
155					laction = LZMA_FINISH;
156
157				lstrm.avail_in = nr;
158			}
159
160			lzmaret = lzma_code(&lstrm, laction);
161
162			if (lzmaret != LZMA_OK && lzmaret != LZMA_STREAM_END)
163				return (-1);
164
165			if (lstrm.avail_out == 0 || lzmaret == LZMA_STREAM_END) {
166				bufrem = MAXBUFSIZ - lstrm.avail_out;
167				lstrm.next_out = buffer;
168				lstrm.avail_out = MAXBUFSIZ;
169			}
170		} while (bufrem == 0 && lzmaret != LZMA_STREAM_END);
171
172		return (0);
173#endif	/* WITHOUT_LZMA */
174	default:
175		nr = read(f->fd, buffer, MAXBUFSIZ);
176	}
177	if (nr < 0)
178		return (-1);
179
180	bufrem = nr;
181	return (0);
182}
183
184static inline int
185grep_lnbufgrow(size_t newlen)
186{
187
188	if (lnbuflen < newlen) {
189		lnbuf = grep_realloc(lnbuf, newlen);
190		lnbuflen = newlen;
191	}
192
193	return (0);
194}
195
196char *
197grep_fgetln(struct file *f, struct parsec *pc)
198{
199	unsigned char *p;
200	char *ret;
201	size_t len;
202	size_t off;
203	ptrdiff_t diff;
204
205	/* Fill the buffer, if necessary */
206	if (bufrem == 0 && grep_refill(f) != 0)
207		goto error;
208
209	if (bufrem == 0) {
210		/* Return zero length to indicate EOF */
211		pc->ln.len= 0;
212		return (bufpos);
213	}
214
215	/* Look for a newline in the remaining part of the buffer */
216	if ((p = memchr(bufpos, fileeol, bufrem)) != NULL) {
217		++p; /* advance over newline */
218		ret = bufpos;
219		len = p - bufpos;
220		bufrem -= len;
221		bufpos = p;
222		pc->ln.len = len;
223		return (ret);
224	}
225
226	/* We have to copy the current buffered data to the line buffer */
227	for (len = bufrem, off = 0; ; len += bufrem) {
228		/* Make sure there is room for more data */
229		if (grep_lnbufgrow(len + LNBUFBUMP))
230			goto error;
231		memcpy(lnbuf + off, bufpos, len - off);
232		/* With FILE_MMAP, this is EOF; there's no more to refill */
233		if (filebehave == FILE_MMAP) {
234			bufrem -= len;
235			break;
236		}
237		off = len;
238		/* Fetch more to try and find EOL/EOF */
239		if (grep_refill(f) != 0)
240			goto error;
241		if (bufrem == 0)
242			/* EOF: return partial line */
243			break;
244		if ((p = memchr(bufpos, fileeol, bufrem)) == NULL)
245			continue;
246		/* got it: finish up the line (like code above) */
247		++p;
248		diff = p - bufpos;
249		len += diff;
250		if (grep_lnbufgrow(len))
251		    goto error;
252		memcpy(lnbuf + off, bufpos, diff);
253		bufrem -= diff;
254		bufpos = p;
255		break;
256	}
257	pc->ln.len = len;
258	return (lnbuf);
259
260error:
261	pc->ln.len = 0;
262	return (NULL);
263}
264
265/*
266 * Opens a file for processing.
267 */
268struct file *
269grep_open(const char *path)
270{
271	struct file *f;
272#ifndef WITHOUT_LZMA
273	lzma_ret lzmaret;
274#endif
275
276	f = grep_malloc(sizeof *f);
277	memset(f, 0, sizeof *f);
278	if (path == NULL) {
279		/* Processing stdin implies --line-buffered. */
280		lbflag = true;
281		f->fd = STDIN_FILENO;
282	} else if ((f->fd = open(path, O_RDONLY)) == -1)
283		goto error1;
284
285	if (filebehave == FILE_MMAP) {
286		struct stat st;
287
288		if ((fstat(f->fd, &st) == -1) || (st.st_size > OFF_MAX) ||
289		    (!S_ISREG(st.st_mode)))
290			filebehave = FILE_STDIO;
291		else {
292			int flags = MAP_PRIVATE | MAP_NOCORE | MAP_NOSYNC;
293#ifdef MAP_PREFAULT_READ
294			flags |= MAP_PREFAULT_READ;
295#endif
296			fsiz = st.st_size;
297			buffer = mmap(NULL, fsiz, PROT_READ, flags,
298			     f->fd, (off_t)0);
299			if (buffer == MAP_FAILED)
300				filebehave = FILE_STDIO;
301			else {
302				bufrem = st.st_size;
303				bufpos = buffer;
304				madvise(buffer, st.st_size, MADV_SEQUENTIAL);
305			}
306		}
307	}
308
309	if ((buffer == NULL) || (buffer == MAP_FAILED))
310		buffer = grep_malloc(MAXBUFSIZ);
311
312	switch (filebehave) {
313	case FILE_GZIP:
314		if ((gzbufdesc = gzdopen(f->fd, "r")) == NULL)
315			goto error2;
316		break;
317#ifndef WITHOUT_BZIP2
318	case FILE_BZIP:
319		if ((bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
320			goto error2;
321		break;
322#endif
323#ifndef WITHOUT_LZMA
324	case FILE_XZ:
325	case FILE_LZMA:
326
327		if (filebehave == FILE_XZ)
328			lzmaret = lzma_stream_decoder(&lstrm, UINT64_MAX,
329			    LZMA_CONCATENATED);
330		else
331			lzmaret = lzma_alone_decoder(&lstrm, UINT64_MAX);
332
333		if (lzmaret != LZMA_OK)
334			goto error2;
335
336		lstrm.avail_in = 0;
337		lstrm.avail_out = MAXBUFSIZ;
338		laction = LZMA_RUN;
339		break;
340#endif
341	}
342
343	/* Fill read buffer, also catches errors early */
344	if (bufrem == 0 && grep_refill(f) != 0)
345		goto error2;
346
347	/* Check for binary stuff, if necessary */
348	if (binbehave != BINFILE_TEXT && fileeol != '\0' &&
349	    memchr(bufpos, '\0', bufrem) != NULL)
350		f->binary = true;
351
352	return (f);
353
354error2:
355	close(f->fd);
356error1:
357	free(f);
358	return (NULL);
359}
360
361/*
362 * Closes a file.
363 */
364void
365grep_close(struct file *f)
366{
367
368	close(f->fd);
369
370	/* Reset read buffer and line buffer */
371	if (filebehave == FILE_MMAP) {
372		munmap(buffer, fsiz);
373		buffer = NULL;
374	}
375	bufpos = buffer;
376	bufrem = 0;
377
378	free(lnbuf);
379	lnbuf = NULL;
380	lnbuflen = 0;
381}
382