148981Ssheldonh/*	$NetBSD: file.c,v 1.5 2011/02/16 18:35:39 joerg Exp $	*/
248981Ssheldonh/*	$FreeBSD$	*/
348981Ssheldonh/*	$OpenBSD: file.c,v 1.11 2010/07/02 20:48:48 nicm Exp $	*/
448981Ssheldonh
548981Ssheldonh/*-
 * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
748981Ssheldonh * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
848981Ssheldonh * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
948981Ssheldonh * All rights reserved.
1048981Ssheldonh *
1148981Ssheldonh * Redistribution and use in source and binary forms, with or without
1248981Ssheldonh * modification, are permitted provided that the following conditions
1348981Ssheldonh * are met:
1448981Ssheldonh * 1. Redistributions of source code must retain the above copyright
1548981Ssheldonh *    notice, this list of conditions and the following disclaimer.
1648981Ssheldonh * 2. Redistributions in binary form must reproduce the above copyright
1748981Ssheldonh *    notice, this list of conditions and the following disclaimer in the
1848981Ssheldonh *    documentation and/or other materials provided with the distribution.
1948981Ssheldonh *
2048981Ssheldonh * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
2148981Ssheldonh * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2248981Ssheldonh * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2348981Ssheldonh * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
2448981Ssheldonh * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2548981Ssheldonh * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2648981Ssheldonh * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2748981Ssheldonh * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2848981Ssheldonh * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2950479Speter * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3048981Ssheldonh * SUCH DAMAGE.
3148981Ssheldonh */
3248981Ssheldonh
3348981Ssheldonh#include <sys/cdefs.h>
3478356Sdwmalone__FBSDID("$FreeBSD$");
35101474Sume
3648981Ssheldonh#include <sys/param.h>
3748981Ssheldonh#include <sys/mman.h>
3848981Ssheldonh#include <sys/stat.h>
3948981Ssheldonh#include <sys/types.h>
4048981Ssheldonh
4148981Ssheldonh#include <err.h>
4248981Ssheldonh#include <errno.h>
4348981Ssheldonh#include <fcntl.h>
4448981Ssheldonh#include <lzma.h>
4548981Ssheldonh#include <stddef.h>
4648981Ssheldonh#include <stdlib.h>
4756590Sshin#include <string.h>
4848981Ssheldonh#include <unistd.h>
4948981Ssheldonh#include <wchar.h>
5048981Ssheldonh#include <wctype.h>
5148981Ssheldonh#include <zlib.h>
52101474Sume
53101474Sume#ifndef WITHOUT_BZIP2
54101474Sume#include <bzlib.h>
55101474Sume#endif
56101474Sume
57101474Sume#include "grep.h"
58101474Sume
59101474Sume#define	MAXBUFSIZ	(32 * 1024)
60101474Sume#define	LNBUFBUMP	80
61101474Sume
62101474Sumestatic gzFile gzbufdesc;
63101474Sumestatic lzma_stream lstrm = LZMA_STREAM_INIT;
64101474Sume#ifndef WITHOUT_BZIP2
65101474Sumestatic BZFILE* bzbufdesc;
66101474Sume#endif
6748981Ssheldonh
6848981Ssheldonhstatic unsigned char *buffer;
6948981Ssheldonhstatic unsigned char *bufpos;
7056590Sshinstatic size_t bufrem;
7148981Ssheldonhstatic size_t fsiz;
7248981Ssheldonh
7348981Ssheldonhstatic unsigned char *lnbuf;
7448981Ssheldonhstatic size_t lnbuflen;
7548981Ssheldonh
7648981Ssheldonhstatic inline int
7748981Ssheldonhgrep_refill(struct file *f)
7848981Ssheldonh{
7948981Ssheldonh	ssize_t nr;
8048981Ssheldonh
8148981Ssheldonh	if (filebehave == FILE_MMAP)
8248981Ssheldonh		return (0);
8348981Ssheldonh
8448981Ssheldonh	bufpos = buffer;
8548981Ssheldonh	bufrem = 0;
8656590Sshin
8767514Sdwmalone	if (filebehave == FILE_GZIP) {
8856590Sshin		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
8948981Ssheldonh#ifndef WITHOUT_BZIP2
9056590Sshin	} else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
9156590Sshin		int bzerr;
9256590Sshin
9356590Sshin		nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
9478356Sdwmalone		switch (bzerr) {
9556590Sshin		case BZ_OK:
9656590Sshin		case BZ_STREAM_END:
9756590Sshin			/* No problem, nr will be okay */
9856590Sshin			break;
9978356Sdwmalone		case BZ_DATA_ERROR_MAGIC:
10071399Sdwmalone			/*
10178356Sdwmalone			 * As opposed to gzread(), which simply returns the
10278356Sdwmalone			 * plain file data, if it is not in the correct
10378356Sdwmalone			 * compressed format, BZ2_bzRead() instead aborts.
10448981Ssheldonh			 *
10548981Ssheldonh			 * So, just restart at the beginning of the file again,
10648981Ssheldonh			 * and use plain reads from now on.
10748981Ssheldonh			 */
10848981Ssheldonh			BZ2_bzReadClose(&bzerr, bzbufdesc);
10948981Ssheldonh			bzbufdesc = NULL;
11048981Ssheldonh			if (lseek(f->fd, 0, SEEK_SET) == -1)
11148981Ssheldonh				return (-1);
112236875Sdelphij			nr = read(f->fd, buffer, MAXBUFSIZ);
11348981Ssheldonh			break;
11456590Sshin		default:
11556590Sshin			/* Make sure we exit with an error */
11656590Sshin			nr = -1;
11756590Sshin		}
118101474Sume#endif
119101474Sume	} else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
12048981Ssheldonh		lzma_action action = LZMA_RUN;
12148981Ssheldonh		uint8_t in_buf[MAXBUFSIZ];
12256590Sshin		lzma_ret ret;
12356590Sshin
12456590Sshin		ret = (filebehave == FILE_XZ) ?
12598562Sjmallett		    lzma_stream_decoder(&lstrm, UINT64_MAX,
12698562Sjmallett		    LZMA_CONCATENATED) :
12798562Sjmallett		    lzma_alone_decoder(&lstrm, UINT64_MAX);
12898562Sjmallett
12998562Sjmallett		if (ret != LZMA_OK)
13098562Sjmallett			return (-1);
13198562Sjmallett
13298562Sjmallett		lstrm.next_out = buffer;
13348981Ssheldonh		lstrm.avail_out = MAXBUFSIZ;
13478694Sdwmalone		lstrm.next_in = in_buf;
13578694Sdwmalone		nr = read(f->fd, in_buf, MAXBUFSIZ);
13648981Ssheldonh
13798562Sjmallett		if (nr < 0)
13878694Sdwmalone			return (-1);
13948981Ssheldonh		else if (nr == 0)
14078694Sdwmalone			action = LZMA_FINISH;
14148981Ssheldonh
14248981Ssheldonh		lstrm.avail_in = nr;
14348981Ssheldonh		ret = lzma_code(&lstrm, action);
14478694Sdwmalone
14548981Ssheldonh		if (ret != LZMA_OK && ret != LZMA_STREAM_END)
146			return (-1);
147		bufrem = MAXBUFSIZ - lstrm.avail_out;
148		return (0);
149	} else
150		nr = read(f->fd, buffer, MAXBUFSIZ);
151
152	if (nr < 0)
153		return (-1);
154
155	bufrem = nr;
156	return (0);
157}
158
159static inline int
160grep_lnbufgrow(size_t newlen)
161{
162
163	if (lnbuflen < newlen) {
164		lnbuf = grep_realloc(lnbuf, newlen);
165		lnbuflen = newlen;
166	}
167
168	return (0);
169}
170
171char *
172grep_fgetln(struct file *f, size_t *lenp)
173{
174	unsigned char *p;
175	char *ret;
176	size_t len;
177	size_t off;
178	ptrdiff_t diff;
179
180	/* Fill the buffer, if necessary */
181	if (bufrem == 0 && grep_refill(f) != 0)
182		goto error;
183
184	if (bufrem == 0) {
185		/* Return zero length to indicate EOF */
186		*lenp = 0;
187		return (bufpos);
188	}
189
190	/* Look for a newline in the remaining part of the buffer */
191	if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
192		++p; /* advance over newline */
193		ret = bufpos;
194		len = p - bufpos;
195		bufrem -= len;
196		bufpos = p;
197		*lenp = len;
198		return (ret);
199	}
200
201	/* We have to copy the current buffered data to the line buffer */
202	for (len = bufrem, off = 0; ; len += bufrem) {
203		/* Make sure there is room for more data */
204		if (grep_lnbufgrow(len + LNBUFBUMP))
205			goto error;
206		memcpy(lnbuf + off, bufpos, len - off);
207		off = len;
208		if (grep_refill(f) != 0)
209			goto error;
210		if (bufrem == 0)
211			/* EOF: return partial line */
212			break;
213		if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
214			continue;
215		/* got it: finish up the line (like code above) */
216		++p;
217		diff = p - bufpos;
218		len += diff;
219		if (grep_lnbufgrow(len))
220		    goto error;
221		memcpy(lnbuf + off, bufpos, diff);
222		bufrem -= diff;
223		bufpos = p;
224		break;
225	}
226	*lenp = len;
227	return (lnbuf);
228
229error:
230	*lenp = 0;
231	return (NULL);
232}
233
234/*
235 * Opens a file for processing.
236 */
237struct file *
238grep_open(const char *path)
239{
240	struct file *f;
241
242	f = grep_malloc(sizeof *f);
243	memset(f, 0, sizeof *f);
244	if (path == NULL) {
245		/* Processing stdin implies --line-buffered. */
246		lbflag = true;
247		f->fd = STDIN_FILENO;
248	} else if ((f->fd = open(path, O_RDONLY)) == -1)
249		goto error1;
250
251	if (filebehave == FILE_MMAP) {
252		struct stat st;
253
254		if ((fstat(f->fd, &st) == -1) || (st.st_size > OFF_MAX) ||
255		    (!S_ISREG(st.st_mode)))
256			filebehave = FILE_STDIO;
257		else {
258			int flags = MAP_PRIVATE | MAP_NOCORE | MAP_NOSYNC;
259#ifdef MAP_PREFAULT_READ
260			flags |= MAP_PREFAULT_READ;
261#endif
262			fsiz = st.st_size;
263			buffer = mmap(NULL, fsiz, PROT_READ, flags,
264			     f->fd, (off_t)0);
265			if (buffer == MAP_FAILED)
266				filebehave = FILE_STDIO;
267			else {
268				bufrem = st.st_size;
269				bufpos = buffer;
270				madvise(buffer, st.st_size, MADV_SEQUENTIAL);
271			}
272		}
273	}
274
275	if ((buffer == NULL) || (buffer == MAP_FAILED))
276		buffer = grep_malloc(MAXBUFSIZ);
277
278	if (filebehave == FILE_GZIP &&
279	    (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
280		goto error2;
281
282#ifndef WITHOUT_BZIP2
283	if (filebehave == FILE_BZIP &&
284	    (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
285		goto error2;
286#endif
287
288	/* Fill read buffer, also catches errors early */
289	if (bufrem == 0 && grep_refill(f) != 0)
290		goto error2;
291
292	/* Check for binary stuff, if necessary */
293	if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
294	f->binary = true;
295
296	return (f);
297
298error2:
299	close(f->fd);
300error1:
301	free(f);
302	return (NULL);
303}
304
305/*
306 * Closes a file.
307 */
308void
309grep_close(struct file *f)
310{
311
312	close(f->fd);
313
314	/* Reset read buffer and line buffer */
315	if (filebehave == FILE_MMAP) {
316		munmap(buffer, fsiz);
317		buffer = NULL;
318	}
319	bufpos = buffer;
320	bufrem = 0;
321
322	free(lnbuf);
323	lnbuf = NULL;
324	lnbuflen = 0;
325}
326