file.c revision 220422
1220422Sgabor/*	$NetBSD: file.c,v 1.5 2011/02/16 18:35:39 joerg Exp $	*/
2220422Sgabor/*	$FreeBSD: head/usr.bin/grep/file.c 220422 2011-04-07 13:03:35Z gabor $	*/
3210389Sgabor/*	$OpenBSD: file.c,v 1.11 2010/07/02 20:48:48 nicm Exp $	*/
4210389Sgabor
5210389Sgabor/*-
6211496Sdes * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
7211463Sgabor * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
8211463Sgabor * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
9210389Sgabor * All rights reserved.
10210389Sgabor *
11210389Sgabor * Redistribution and use in source and binary forms, with or without
12210389Sgabor * modification, are permitted provided that the following conditions
13210389Sgabor * are met:
14210389Sgabor * 1. Redistributions of source code must retain the above copyright
15210389Sgabor *    notice, this list of conditions and the following disclaimer.
16210389Sgabor * 2. Redistributions in binary form must reproduce the above copyright
17210389Sgabor *    notice, this list of conditions and the following disclaimer in the
18210389Sgabor *    documentation and/or other materials provided with the distribution.
19210389Sgabor *
20210389Sgabor * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21210389Sgabor * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22210389Sgabor * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23210389Sgabor * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24210389Sgabor * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25210389Sgabor * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26210389Sgabor * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27210389Sgabor * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28210389Sgabor * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29210389Sgabor * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30210389Sgabor * SUCH DAMAGE.
31210389Sgabor */
32210389Sgabor
33210389Sgabor#include <sys/cdefs.h>
34210389Sgabor__FBSDID("$FreeBSD: head/usr.bin/grep/file.c 220422 2011-04-07 13:03:35Z gabor $");
35210389Sgabor
36210389Sgabor#include <sys/param.h>
37210389Sgabor#include <sys/types.h>
38210389Sgabor#include <sys/stat.h>
39210389Sgabor
40210389Sgabor#include <bzlib.h>
41210389Sgabor#include <err.h>
42210389Sgabor#include <errno.h>
43211463Sgabor#include <fcntl.h>
44211463Sgabor#include <stddef.h>
45210389Sgabor#include <stdlib.h>
46210389Sgabor#include <string.h>
47210389Sgabor#include <unistd.h>
48210389Sgabor#include <wchar.h>
49210389Sgabor#include <wctype.h>
50210389Sgabor#include <zlib.h>
51210389Sgabor
52210389Sgabor#include "grep.h"
53210389Sgabor
54211463Sgabor#define	MAXBUFSIZ	(32 * 1024)
55211463Sgabor#define	LNBUFBUMP	80
56210389Sgabor
57211463Sgaborstatic gzFile gzbufdesc;
58211463Sgaborstatic BZFILE* bzbufdesc;
59210389Sgabor
60211463Sgaborstatic unsigned char buffer[MAXBUFSIZ];
61211463Sgaborstatic unsigned char *bufpos;
62211463Sgaborstatic size_t bufrem;
63210389Sgabor
64211463Sgaborstatic unsigned char *lnbuf;
65211463Sgaborstatic size_t lnbuflen;
66210389Sgabor
67211364Sgaborstatic inline int
68211463Sgaborgrep_refill(struct file *f)
69210389Sgabor{
70211463Sgabor	ssize_t nr;
71211463Sgabor	int bzerr;
72210389Sgabor
73211463Sgabor	bufpos = buffer;
74211463Sgabor	bufrem = 0;
75211463Sgabor
76211463Sgabor	if (filebehave == FILE_GZIP)
77211463Sgabor		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
78211463Sgabor	else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
79211463Sgabor		nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
80211463Sgabor		switch (bzerr) {
81211463Sgabor		case BZ_OK:
82211463Sgabor		case BZ_STREAM_END:
83211463Sgabor			/* No problem, nr will be okay */
84211463Sgabor			break;
85211463Sgabor		case BZ_DATA_ERROR_MAGIC:
86211463Sgabor			/*
87211463Sgabor			 * As opposed to gzread(), which simply returns the
88211463Sgabor			 * plain file data, if it is not in the correct
89211463Sgabor			 * compressed format, BZ2_bzRead() instead aborts.
90211463Sgabor			 *
91211463Sgabor			 * So, just restart at the beginning of the file again,
92211463Sgabor			 * and use plain reads from now on.
93211463Sgabor			 */
94211463Sgabor			BZ2_bzReadClose(&bzerr, bzbufdesc);
95211463Sgabor			bzbufdesc = NULL;
96211463Sgabor			if (lseek(f->fd, 0, SEEK_SET) == -1)
97211463Sgabor				return (-1);
98211463Sgabor			nr = read(f->fd, buffer, MAXBUFSIZ);
99211463Sgabor			break;
100211463Sgabor		default:
101211463Sgabor			/* Make sure we exit with an error */
102211463Sgabor			nr = -1;
103211463Sgabor		}
104211463Sgabor	} else
105211463Sgabor		nr = read(f->fd, buffer, MAXBUFSIZ);
106211463Sgabor
107211463Sgabor	if (nr < 0)
108211463Sgabor		return (-1);
109211463Sgabor
110211463Sgabor	bufrem = nr;
111211463Sgabor	return (0);
112210389Sgabor}
113210389Sgabor
114211364Sgaborstatic inline int
115211463Sgaborgrep_lnbufgrow(size_t newlen)
116210389Sgabor{
117210389Sgabor
118211463Sgabor	if (lnbuflen < newlen) {
119211463Sgabor		lnbuf = grep_realloc(lnbuf, newlen);
120211463Sgabor		lnbuflen = newlen;
121210389Sgabor	}
122211463Sgabor
123211463Sgabor	return (0);
124210389Sgabor}
125210389Sgabor
126210389Sgaborchar *
127211463Sgaborgrep_fgetln(struct file *f, size_t *lenp)
128210389Sgabor{
129211463Sgabor	unsigned char *p;
130211463Sgabor	char *ret;
131211463Sgabor	size_t len;
132211463Sgabor	size_t off;
133211463Sgabor	ptrdiff_t diff;
134210389Sgabor
135211463Sgabor	/* Fill the buffer, if necessary */
136211463Sgabor	if (bufrem == 0 && grep_refill(f) != 0)
137211463Sgabor		goto error;
138210389Sgabor
139211463Sgabor	if (bufrem == 0) {
140211463Sgabor		/* Return zero length to indicate EOF */
141211463Sgabor		*lenp = 0;
142211463Sgabor		return (bufpos);
143211463Sgabor	}
144210389Sgabor
145211463Sgabor	/* Look for a newline in the remaining part of the buffer */
146211463Sgabor	if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
147211463Sgabor		++p; /* advance over newline */
148211463Sgabor		ret = bufpos;
149211463Sgabor		len = p - bufpos;
150211463Sgabor		bufrem -= len;
151211463Sgabor		bufpos = p;
152211463Sgabor		*lenp = len;
153211463Sgabor		return (ret);
154210389Sgabor	}
155210389Sgabor
156211463Sgabor	/* We have to copy the current buffered data to the line buffer */
157211463Sgabor	for (len = bufrem, off = 0; ; len += bufrem) {
158211463Sgabor		/* Make sure there is room for more data */
159211463Sgabor		if (grep_lnbufgrow(len + LNBUFBUMP))
160211463Sgabor			goto error;
161211463Sgabor		memcpy(lnbuf + off, bufpos, len - off);
162211463Sgabor		off = len;
163211463Sgabor		if (grep_refill(f) != 0)
164211463Sgabor			goto error;
165211463Sgabor		if (bufrem == 0)
166211463Sgabor			/* EOF: return partial line */
167210389Sgabor			break;
168211463Sgabor		if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
169211463Sgabor			continue;
170211463Sgabor		/* got it: finish up the line (like code above) */
171211463Sgabor		++p;
172211463Sgabor		diff = p - bufpos;
173211463Sgabor		len += diff;
174211463Sgabor		if (grep_lnbufgrow(len))
175211463Sgabor		    goto error;
176211463Sgabor		memcpy(lnbuf + off, bufpos, diff);
177211463Sgabor		bufrem -= diff;
178211463Sgabor		bufpos = p;
179211463Sgabor		break;
180210389Sgabor	}
181211463Sgabor	*lenp = len;
182210389Sgabor	return (lnbuf);
183211463Sgabor
184211463Sgaborerror:
185211463Sgabor	*lenp = 0;
186211463Sgabor	return (NULL);
187210389Sgabor}
188210389Sgabor
189211463Sgaborstatic inline struct file *
190211463Sgaborgrep_file_init(struct file *f)
191210389Sgabor{
192210389Sgabor
193211463Sgabor	if (filebehave == FILE_GZIP &&
194211463Sgabor	    (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
195211463Sgabor		goto error;
196211364Sgabor
197211463Sgabor	if (filebehave == FILE_BZIP &&
198211463Sgabor	    (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
199211463Sgabor		goto error;
200210389Sgabor
201211463Sgabor	/* Fill read buffer, also catches errors early */
202211463Sgabor	if (grep_refill(f) != 0)
203211463Sgabor		goto error;
204210389Sgabor
205211463Sgabor	/* Check for binary stuff, if necessary */
206211463Sgabor	if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
207211463Sgabor		f->binary = true;
208210389Sgabor
209211463Sgabor	return (f);
210211463Sgaborerror:
211211463Sgabor	close(f->fd);
212210389Sgabor	free(f);
213210389Sgabor	return (NULL);
214210389Sgabor}
215210389Sgabor
216210389Sgabor/*
217211463Sgabor * Opens a file for processing.
218210389Sgabor */
219210389Sgaborstruct file *
220210389Sgaborgrep_open(const char *path)
221210389Sgabor{
222210389Sgabor	struct file *f;
223210389Sgabor
224210389Sgabor	f = grep_malloc(sizeof *f);
225211463Sgabor	memset(f, 0, sizeof *f);
226211463Sgabor	if (path == NULL) {
227211463Sgabor		/* Processing stdin implies --line-buffered. */
228211463Sgabor		lbflag = true;
229211463Sgabor		f->fd = STDIN_FILENO;
230211463Sgabor	} else if ((f->fd = open(path, O_RDONLY)) == -1) {
231211463Sgabor		free(f);
232211463Sgabor		return (NULL);
233210389Sgabor	}
234210389Sgabor
235211463Sgabor	return (grep_file_init(f));
236210389Sgabor}
237210389Sgabor
238210389Sgabor/*
239211463Sgabor * Closes a file.
240210389Sgabor */
241210389Sgaborvoid
242210389Sgaborgrep_close(struct file *f)
243210389Sgabor{
244210389Sgabor
245211463Sgabor	close(f->fd);
246210389Sgabor
247211463Sgabor	/* Reset read buffer and line buffer */
248211463Sgabor	bufpos = buffer;
249211463Sgabor	bufrem = 0;
250211463Sgabor
251211463Sgabor	free(lnbuf);
252211463Sgabor	lnbuf = NULL;
253211463Sgabor	lnbuflen = 0;
254210389Sgabor}
255