1/*	$NetBSD: file.c,v 1.5 2011/02/16 18:35:39 joerg Exp $	*/
2/*	$FreeBSD$	*/
3/*	$OpenBSD: file.c,v 1.11 2010/07/02 20:48:48 nicm Exp $	*/
4
5/*-
6 * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
7 * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
8 * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD$");
35
36#include <sys/param.h>
37#include <sys/mman.h>
38#include <sys/stat.h>
39#include <sys/types.h>
40
41#include <err.h>
42#include <errno.h>
43#include <fcntl.h>
44#include <lzma.h>
45#include <stddef.h>
46#include <stdlib.h>
47#include <string.h>
48#include <unistd.h>
49#include <wchar.h>
50#include <wctype.h>
51#include <zlib.h>
52
53#ifndef WITHOUT_BZIP2
54#include <bzlib.h>
55#endif
56
57#include "grep.h"
58
59#define	MAXBUFSIZ	(32 * 1024)
60#define	LNBUFBUMP	80
61
62static gzFile gzbufdesc;
63static lzma_stream lstrm = LZMA_STREAM_INIT;
64#ifndef WITHOUT_BZIP2
65static BZFILE* bzbufdesc;
66#endif
67
68static unsigned char *buffer;
69static unsigned char *bufpos;
70static size_t bufrem;
71static size_t fsiz;
72
73static unsigned char *lnbuf;
74static size_t lnbuflen;
75
76static inline int
77grep_refill(struct file *f)
78{
79	ssize_t nr;
80
81	if (filebehave == FILE_MMAP)
82		return (0);
83
84	bufpos = buffer;
85	bufrem = 0;
86
87	if (filebehave == FILE_GZIP) {
88		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
89#ifndef WITHOUT_BZIP2
90	} else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
91		int bzerr;
92
93		nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
94		switch (bzerr) {
95		case BZ_OK:
96		case BZ_STREAM_END:
97			/* No problem, nr will be okay */
98			break;
99		case BZ_DATA_ERROR_MAGIC:
100			/*
101			 * As opposed to gzread(), which simply returns the
102			 * plain file data, if it is not in the correct
103			 * compressed format, BZ2_bzRead() instead aborts.
104			 *
105			 * So, just restart at the beginning of the file again,
106			 * and use plain reads from now on.
107			 */
108			BZ2_bzReadClose(&bzerr, bzbufdesc);
109			bzbufdesc = NULL;
110			if (lseek(f->fd, 0, SEEK_SET) == -1)
111				return (-1);
112			nr = read(f->fd, buffer, MAXBUFSIZ);
113			break;
114		default:
115			/* Make sure we exit with an error */
116			nr = -1;
117		}
118#endif
119	} else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
120		lzma_action action = LZMA_RUN;
121		uint8_t in_buf[MAXBUFSIZ];
122		lzma_ret ret;
123
124		ret = (filebehave == FILE_XZ) ?
125		    lzma_stream_decoder(&lstrm, UINT64_MAX,
126		    LZMA_CONCATENATED) :
127		    lzma_alone_decoder(&lstrm, UINT64_MAX);
128
129		if (ret != LZMA_OK)
130			return (-1);
131
132		lstrm.next_out = buffer;
133		lstrm.avail_out = MAXBUFSIZ;
134		lstrm.next_in = in_buf;
135		nr = read(f->fd, in_buf, MAXBUFSIZ);
136
137		if (nr < 0)
138			return (-1);
139		else if (nr == 0)
140			action = LZMA_FINISH;
141
142		lstrm.avail_in = nr;
143		ret = lzma_code(&lstrm, action);
144
145		if (ret != LZMA_OK && ret != LZMA_STREAM_END)
146			return (-1);
147		bufrem = MAXBUFSIZ - lstrm.avail_out;
148		return (0);
149	} else
150		nr = read(f->fd, buffer, MAXBUFSIZ);
151
152	if (nr < 0)
153		return (-1);
154
155	bufrem = nr;
156	return (0);
157}
158
159static inline int
160grep_lnbufgrow(size_t newlen)
161{
162
163	if (lnbuflen < newlen) {
164		lnbuf = grep_realloc(lnbuf, newlen);
165		lnbuflen = newlen;
166	}
167
168	return (0);
169}
170
171char *
172grep_fgetln(struct file *f, size_t *lenp)
173{
174	unsigned char *p;
175	char *ret;
176	size_t len;
177	size_t off;
178	ptrdiff_t diff;
179
180	/* Fill the buffer, if necessary */
181	if (bufrem == 0 && grep_refill(f) != 0)
182		goto error;
183
184	if (bufrem == 0) {
185		/* Return zero length to indicate EOF */
186		*lenp = 0;
187		return (bufpos);
188	}
189
190	/* Look for a newline in the remaining part of the buffer */
191	if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
192		++p; /* advance over newline */
193		ret = bufpos;
194		len = p - bufpos;
195		bufrem -= len;
196		bufpos = p;
197		*lenp = len;
198		return (ret);
199	}
200
201	/* We have to copy the current buffered data to the line buffer */
202	for (len = bufrem, off = 0; ; len += bufrem) {
203		/* Make sure there is room for more data */
204		if (grep_lnbufgrow(len + LNBUFBUMP))
205			goto error;
206		memcpy(lnbuf + off, bufpos, len - off);
207		off = len;
208		if (grep_refill(f) != 0)
209			goto error;
210		if (bufrem == 0)
211			/* EOF: return partial line */
212			break;
213		if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
214			continue;
215		/* got it: finish up the line (like code above) */
216		++p;
217		diff = p - bufpos;
218		len += diff;
219		if (grep_lnbufgrow(len))
220		    goto error;
221		memcpy(lnbuf + off, bufpos, diff);
222		bufrem -= diff;
223		bufpos = p;
224		break;
225	}
226	*lenp = len;
227	return (lnbuf);
228
229error:
230	*lenp = 0;
231	return (NULL);
232}
233
234/*
235 * Opens a file for processing.
236 */
237struct file *
238grep_open(const char *path)
239{
240	struct file *f;
241
242	f = grep_malloc(sizeof *f);
243	memset(f, 0, sizeof *f);
244	if (path == NULL) {
245		/* Processing stdin implies --line-buffered. */
246		lbflag = true;
247		f->fd = STDIN_FILENO;
248	} else if ((f->fd = open(path, O_RDONLY)) == -1)
249		goto error1;
250
251	if (filebehave == FILE_MMAP) {
252		struct stat st;
253
254		if ((fstat(f->fd, &st) == -1) || (st.st_size > OFF_MAX) ||
255		    (!S_ISREG(st.st_mode)))
256			filebehave = FILE_STDIO;
257		else {
258			int flags = MAP_PRIVATE | MAP_NOCORE | MAP_NOSYNC;
259#ifdef MAP_PREFAULT_READ
260			flags |= MAP_PREFAULT_READ;
261#endif
262			fsiz = st.st_size;
263			buffer = mmap(NULL, fsiz, PROT_READ, flags,
264			     f->fd, (off_t)0);
265			if (buffer == MAP_FAILED)
266				filebehave = FILE_STDIO;
267			else {
268				bufrem = st.st_size;
269				bufpos = buffer;
270				madvise(buffer, st.st_size, MADV_SEQUENTIAL);
271			}
272		}
273	}
274
275	if ((buffer == NULL) || (buffer == MAP_FAILED))
276		buffer = grep_malloc(MAXBUFSIZ);
277
278	if (filebehave == FILE_GZIP &&
279	    (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
280		goto error2;
281
282#ifndef WITHOUT_BZIP2
283	if (filebehave == FILE_BZIP &&
284	    (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
285		goto error2;
286#endif
287
288	/* Fill read buffer, also catches errors early */
289	if (bufrem == 0 && grep_refill(f) != 0)
290		goto error2;
291
292	/* Check for binary stuff, if necessary */
293	if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
294	f->binary = true;
295
296	return (f);
297
298error2:
299	close(f->fd);
300error1:
301	free(f);
302	return (NULL);
303}
304
305/*
306 * Closes a file.
307 */
308void
309grep_close(struct file *f)
310{
311
312	close(f->fd);
313
314	/* Reset read buffer and line buffer */
315	if (filebehave == FILE_MMAP) {
316		munmap(buffer, fsiz);
317		buffer = NULL;
318	}
319	bufpos = buffer;
320	bufrem = 0;
321
322	free(lnbuf);
323	lnbuf = NULL;
324	lnbuflen = 0;
325}
326