195926Stjr/*-
295926Stjr * Copyright (c) 2002 Tim J. Robbins.
395926Stjr * All rights reserved.
495926Stjr *
595926Stjr * Redistribution and use in source and binary forms, with or without
695926Stjr * modification, are permitted provided that the following conditions
795926Stjr * are met:
895926Stjr * 1. Redistributions of source code must retain the above copyright
995926Stjr *    notice, this list of conditions and the following disclaimer.
1095926Stjr * 2. Redistributions in binary form must reproduce the above copyright
1195926Stjr *    notice, this list of conditions and the following disclaimer in the
1295926Stjr *    documentation and/or other materials provided with the distribution.
1395926Stjr *
1495926Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
1595926Stjr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1695926Stjr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1795926Stjr * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
1895926Stjr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1995926Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2095926Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2195926Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2295926Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2395926Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2495926Stjr * SUCH DAMAGE.
2595926Stjr */
2695926Stjr
2795926Stjr/*
2895926Stjr * csplit -- split files based on context
2995926Stjr *
3095926Stjr * This utility splits its input into numbered output files by line number
3195926Stjr * or by a regular expression. Regular expression matches have an optional
3295926Stjr * offset with them, allowing the split to occur a specified number of
3395926Stjr * lines before or after the match.
3495926Stjr *
3595926Stjr * To handle negative offsets, we stop reading when the match occurs and
3695926Stjr * store the offset that the file should have been split at, then use
3795926Stjr * this output file as input until all the "overflowed" lines have been read.
3895926Stjr * The file is then closed and truncated to the correct length.
3995926Stjr *
4095926Stjr * We assume that the output files can be seeked upon (ie. they cannot be
4195926Stjr * symlinks to named pipes or character devices), but make no such
4295926Stjr * assumption about the input.
4395926Stjr */
4495926Stjr
4595926Stjr#include <sys/cdefs.h>
4695926Stjr__FBSDID("$FreeBSD$");
4795926Stjr
4895926Stjr#include <sys/types.h>
4995926Stjr
5095926Stjr#include <ctype.h>
5195926Stjr#include <err.h>
5295926Stjr#include <errno.h>
5395926Stjr#include <limits.h>
5497977Stjr#include <locale.h>
5595926Stjr#include <regex.h>
5695926Stjr#include <signal.h>
5795926Stjr#include <stdint.h>
5895926Stjr#include <stdio.h>
5995926Stjr#include <stdlib.h>
6095926Stjr#include <string.h>
6195926Stjr#include <unistd.h>
6295926Stjr
63227161Sedstatic void	 cleanup(void);
64227161Sedstatic void	 do_lineno(const char *);
65227161Sedstatic void	 do_rexp(const char *);
66227161Sedstatic char	*getline(void);
67227161Sedstatic void	 handlesig(int);
68227161Sedstatic FILE	*newfile(void);
69227161Sedstatic void	 toomuch(FILE *, long);
70227161Sedstatic void	 usage(void);
7195926Stjr
7295926Stjr/*
7395926Stjr * Command line options
7495926Stjr */
75227161Sedstatic const char *prefix;	/* File name prefix */
76227161Sedstatic long	 sufflen;	/* Number of decimal digits for suffix */
77227161Sedstatic int	 sflag;		/* Suppress output of file names */
78227161Sedstatic int	 kflag;		/* Keep output if error occurs */
7995926Stjr
8095926Stjr/*
8195926Stjr * Other miscellaneous globals (XXX too many)
8295926Stjr */
83227161Sedstatic long	 lineno;	/* Current line number in input file */
84227161Sedstatic long	 reps;		/* Number of repetitions for this pattern */
85227161Sedstatic long	 nfiles;	/* Number of files output so far */
86227161Sedstatic long	 maxfiles;	/* Maximum number of files we can create */
87227161Sedstatic char	 currfile[PATH_MAX]; /* Current output file */
88227161Sedstatic const char *infn;	/* Name of the input file */
89227161Sedstatic FILE	*infile;	/* Input file handle */
90227161Sedstatic FILE	*overfile;	/* Overflow file for toomuch() */
91227161Sedstatic off_t	 truncofs;	/* Offset this file should be truncated at */
92227161Sedstatic int	 doclean;	/* Should cleanup() remove output? */
9395926Stjr
9495926Stjrint
9595926Stjrmain(int argc, char *argv[])
9695926Stjr{
97100697Stjr	struct sigaction sa;
9895926Stjr	long i;
9995926Stjr	int ch;
10095926Stjr	const char *expr;
10195926Stjr	char *ep, *p;
10295926Stjr	FILE *ofp;
10395926Stjr
10497977Stjr	setlocale(LC_ALL, "");
10597977Stjr
10695926Stjr	kflag = sflag = 0;
10795926Stjr	prefix = "xx";
10895926Stjr	sufflen = 2;
10995926Stjr	while ((ch = getopt(argc, argv, "ksf:n:")) > 0) {
11095926Stjr		switch (ch) {
11195926Stjr		case 'f':
11295926Stjr			prefix = optarg;
11395926Stjr			break;
11495926Stjr		case 'k':
11595926Stjr			kflag = 1;
11695926Stjr			break;
11795926Stjr		case 'n':
11895926Stjr			errno = 0;
11995926Stjr			sufflen = strtol(optarg, &ep, 10);
12095926Stjr			if (sufflen <= 0 || *ep != '\0' || errno != 0)
12195926Stjr				errx(1, "%s: bad suffix length", optarg);
12295926Stjr			break;
12395926Stjr		case 's':
12495926Stjr			sflag = 1;
12595926Stjr			break;
12695926Stjr		default:
12795926Stjr			usage();
12895926Stjr			/*NOTREACHED*/
12995926Stjr		}
13095926Stjr	}
13195926Stjr
13295926Stjr	if (sufflen + strlen(prefix) >= PATH_MAX)
13395926Stjr		errx(1, "name too long");
13495926Stjr
13595926Stjr	argc -= optind;
13695926Stjr	argv += optind;
13795926Stjr
13895926Stjr	if ((infn = *argv++) == NULL)
13995926Stjr		usage();
14095926Stjr	if (strcmp(infn, "-") == 0) {
14195926Stjr		infile = stdin;
14295926Stjr		infn = "stdin";
14395926Stjr	} else if ((infile = fopen(infn, "r")) == NULL)
14495926Stjr		err(1, "%s", infn);
14595926Stjr
14695926Stjr	if (!kflag) {
14795926Stjr		doclean = 1;
14895926Stjr		atexit(cleanup);
149100697Stjr		sa.sa_flags = 0;
150100697Stjr		sa.sa_handler = handlesig;
151100697Stjr		sigemptyset(&sa.sa_mask);
152100697Stjr		sigaddset(&sa.sa_mask, SIGHUP);
153100697Stjr		sigaddset(&sa.sa_mask, SIGINT);
154100697Stjr		sigaddset(&sa.sa_mask, SIGTERM);
155100697Stjr		sigaction(SIGHUP, &sa, NULL);
156100697Stjr		sigaction(SIGINT, &sa, NULL);
157100697Stjr		sigaction(SIGTERM, &sa, NULL);
15895926Stjr	}
15995926Stjr
16095926Stjr	lineno = 0;
16195926Stjr	nfiles = 0;
16295926Stjr	truncofs = 0;
16395926Stjr	overfile = NULL;
16495926Stjr
16595926Stjr	/* Ensure 10^sufflen < LONG_MAX. */
16695926Stjr	for (maxfiles = 1, i = 0; i < sufflen; i++) {
16795926Stjr		if (maxfiles > LONG_MAX / 10)
16895926Stjr			errx(1, "%ld: suffix too long (limit %ld)",
16995926Stjr			    sufflen, i);
17095926Stjr		maxfiles *= 10;
17195926Stjr	}
17295926Stjr
17395926Stjr	/* Create files based on supplied patterns. */
17495926Stjr	while (nfiles < maxfiles - 1 && (expr = *argv++) != NULL) {
17595926Stjr		/* Look ahead & see if this pattern has any repetitions. */
17695926Stjr		if (*argv != NULL && **argv == '{') {
17795926Stjr			errno = 0;
17895926Stjr			reps = strtol(*argv + 1, &ep, 10);
17995926Stjr			if (reps < 0 || *ep != '}' || errno != 0)
18095926Stjr				errx(1, "%s: bad repetition count", *argv + 1);
18195926Stjr			argv++;
18295926Stjr		} else
18395926Stjr			reps = 0;
18495926Stjr
18595926Stjr		if (*expr == '/' || *expr == '%') {
18695926Stjr			do
18795926Stjr				do_rexp(expr);
18895926Stjr			while (reps-- != 0 && nfiles < maxfiles - 1);
18995926Stjr		} else if (isdigit((unsigned char)*expr))
19095926Stjr			do_lineno(expr);
19195926Stjr		else
19295926Stjr			errx(1, "%s: unrecognised pattern", expr);
19395926Stjr	}
19495926Stjr
19595926Stjr	/* Copy the rest into a new file. */
19695926Stjr	if (!feof(infile)) {
19795926Stjr		ofp = newfile();
19895926Stjr		while ((p = getline()) != NULL && fputs(p, ofp) == 0)
19995926Stjr			;
20095926Stjr		if (!sflag)
20195926Stjr			printf("%jd\n", (intmax_t)ftello(ofp));
20295926Stjr		if (fclose(ofp) != 0)
20395926Stjr			err(1, "%s", currfile);
20495926Stjr	}
20595926Stjr
20695926Stjr	toomuch(NULL, 0);
20795926Stjr	doclean = 0;
20895926Stjr
20995926Stjr	return (0);
21095926Stjr}
21195926Stjr
/* Print the command synopsis on standard error and exit with status 1. */
static void
usage(void)
{
	static const char synopsis[] =
	    "usage: csplit [-ks] [-f prefix] [-n number] file args ...\n";

	fputs(synopsis, stderr);
	exit(1);
}
22095926Stjr
221227161Sedstatic void
22295926Stjrhandlesig(int sig __unused)
22395926Stjr{
22495926Stjr	const char msg[] = "csplit: caught signal, cleaning up\n";
22595926Stjr
22695926Stjr	write(STDERR_FILENO, msg, sizeof(msg) - 1);
22795926Stjr	cleanup();
22895926Stjr	_exit(2);
22995926Stjr}
23095926Stjr
23195926Stjr/* Create a new output file. */
232227161Sedstatic FILE *
23395926Stjrnewfile(void)
23495926Stjr{
23595926Stjr	FILE *fp;
23695926Stjr
237100821Sdwmalone	if ((size_t)snprintf(currfile, sizeof(currfile), "%s%0*ld", prefix,
238127298Stjr	    (int)sufflen, nfiles) >= sizeof(currfile))
239127298Stjr		errc(1, ENAMETOOLONG, NULL);
24095926Stjr	if ((fp = fopen(currfile, "w+")) == NULL)
24195926Stjr		err(1, "%s", currfile);
24295926Stjr	nfiles++;
24395926Stjr
24495926Stjr	return (fp);
24595926Stjr}
24695926Stjr
24795926Stjr/* Remove partial output, called before exiting. */
248227161Sedstatic void
24995926Stjrcleanup(void)
25095926Stjr{
25195926Stjr	char fnbuf[PATH_MAX];
25295926Stjr	long i;
25395926Stjr
25495926Stjr	if (!doclean)
25595926Stjr		return;
25695926Stjr
25795926Stjr	/*
25895926Stjr	 * NOTE: One cannot portably assume to be able to call snprintf()
25995926Stjr	 * from inside a signal handler. It does, however, appear to be safe
26095926Stjr	 * to do on FreeBSD. The solution to this problem is worse than the
26195926Stjr	 * problem itself.
26295926Stjr	 */
26395926Stjr
26495926Stjr	for (i = 0; i < nfiles; i++) {
26595926Stjr		snprintf(fnbuf, sizeof(fnbuf), "%s%0*ld", prefix,
26695926Stjr		    (int)sufflen, i);
26795926Stjr		unlink(fnbuf);
26895926Stjr	}
26995926Stjr}
27095926Stjr
27195926Stjr/* Read a line from the input into a static buffer. */
272227161Sedstatic char *
27395926Stjrgetline(void)
27495926Stjr{
27595926Stjr	static char lbuf[LINE_MAX];
27695926Stjr	FILE *src;
27795926Stjr
27895926Stjr	src = overfile != NULL ? overfile : infile;
27995926Stjr
28095926Stjragain: if (fgets(lbuf, sizeof(lbuf), src) == NULL) {
28195926Stjr		if (src == overfile) {
28295926Stjr			src = infile;
28395926Stjr			goto again;
28495926Stjr		}
28595926Stjr		return (NULL);
28695926Stjr	}
28795926Stjr	if (ferror(src))
28895926Stjr		err(1, "%s", infn);
28995926Stjr	lineno++;
29095926Stjr
29195926Stjr	return (lbuf);
29295926Stjr}
29395926Stjr
29495926Stjr/* Conceptually rewind the input (as obtained by getline()) back `n' lines. */
295227161Sedstatic void
29695926Stjrtoomuch(FILE *ofp, long n)
29795926Stjr{
29895926Stjr	char buf[BUFSIZ];
29995926Stjr	size_t i, nread;
30095926Stjr
30195926Stjr	if (overfile != NULL) {
30295926Stjr		/*
30395926Stjr		 * Truncate the previous file we overflowed into back to
30495926Stjr		 * the correct length, close it.
30595926Stjr		 */
30695926Stjr		if (fflush(overfile) != 0)
30795926Stjr			err(1, "overflow");
30895926Stjr		if (ftruncate(fileno(overfile), truncofs) != 0)
30995926Stjr			err(1, "overflow");
31095926Stjr		if (fclose(overfile) != 0)
31195926Stjr			err(1, "overflow");
31295926Stjr		overfile = NULL;
31395926Stjr	}
31495926Stjr
31595926Stjr	if (n == 0)
31695926Stjr		/* Just tidying up */
31795926Stjr		return;
31895926Stjr
31995926Stjr	lineno -= n;
32095926Stjr
32195926Stjr	/*
32295926Stjr	 * Wind the overflow file backwards to `n' lines before the
32395926Stjr	 * current one.
32495926Stjr	 */
32595926Stjr	do {
32695926Stjr		if (ftello(ofp) < (off_t)sizeof(buf))
32795926Stjr			rewind(ofp);
32895926Stjr		else
329127300Stjr			fseeko(ofp, -(off_t)sizeof(buf), SEEK_CUR);
33095926Stjr		if (ferror(ofp))
33195926Stjr			errx(1, "%s: can't seek", currfile);
33295926Stjr		if ((nread = fread(buf, 1, sizeof(buf), ofp)) == 0)
33395926Stjr			errx(1, "can't read overflowed output");
334127300Stjr		if (fseeko(ofp, -(off_t)nread, SEEK_CUR) != 0)
33595926Stjr			err(1, "%s", currfile);
33695926Stjr		for (i = 1; i <= nread; i++)
33795926Stjr			if (buf[nread - i] == '\n' && n-- == 0)
33895926Stjr				break;
33996710Stjr		if (ftello(ofp) == 0)
34096710Stjr			break;
34195926Stjr	} while (n > 0);
342127300Stjr	if (fseeko(ofp, nread - i + 1, SEEK_CUR) != 0)
34395926Stjr		err(1, "%s", currfile);
34495926Stjr
34595926Stjr	/*
34695926Stjr	 * getline() will read from here. Next call will truncate to
34795926Stjr	 * truncofs in this file.
34895926Stjr	 */
34995926Stjr	overfile = ofp;
35095926Stjr	truncofs = ftello(overfile);
35195926Stjr}
35295926Stjr
35395926Stjr/* Handle splits for /regexp/ and %regexp% patterns. */
354227161Sedstatic void
35595926Stjrdo_rexp(const char *expr)
35695926Stjr{
35795926Stjr	regex_t cre;
35895926Stjr	intmax_t nwritten;
35995926Stjr	long ofs;
36095926Stjr	int first;
36195926Stjr	char *ecopy, *ep, *p, *pofs, *re;
36295926Stjr	FILE *ofp;
36395926Stjr
36495926Stjr	if ((ecopy = strdup(expr)) == NULL)
36595926Stjr		err(1, "strdup");
36695926Stjr
36795926Stjr	re = ecopy + 1;
36895926Stjr	if ((pofs = strrchr(ecopy, *expr)) == NULL || pofs[-1] == '\\')
36995926Stjr		errx(1, "%s: missing trailing %c", expr, *expr);
37095926Stjr	*pofs++ = '\0';
37195926Stjr
37295926Stjr	if (*pofs != '\0') {
37395926Stjr		errno = 0;
37495926Stjr		ofs = strtol(pofs, &ep, 10);
37595926Stjr		if (*ep != '\0' || errno != 0)
37695926Stjr			errx(1, "%s: bad offset", pofs);
37795926Stjr	} else
37895926Stjr		ofs = 0;
37995926Stjr
38095926Stjr	if (regcomp(&cre, re, REG_BASIC|REG_NOSUB) != 0)
38195926Stjr		errx(1, "%s: bad regular expression", re);
38295926Stjr
38395926Stjr	if (*expr == '/')
38495926Stjr		/* /regexp/: Save results to a file. */
38595926Stjr		ofp = newfile();
38695926Stjr	else {
38795926Stjr		/* %regexp%: Make a temporary file for overflow. */
38895926Stjr		if ((ofp = tmpfile()) == NULL)
38995926Stjr			err(1, "tmpfile");
39095926Stjr	}
39195926Stjr
39295926Stjr	/* Read and output lines until we get a match. */
39395926Stjr	first = 1;
39495926Stjr	while ((p = getline()) != NULL) {
39595926Stjr		if (fputs(p, ofp) != 0)
39695926Stjr			break;
39795926Stjr		if (!first && regexec(&cre, p, 0, NULL, 0) == 0)
39895926Stjr			break;
39995926Stjr		first = 0;
40095926Stjr	}
40195926Stjr
40295926Stjr	if (p == NULL)
40395926Stjr		errx(1, "%s: no match", re);
40495926Stjr
40595926Stjr	if (ofs <= 0) {
40695926Stjr		/*
40795926Stjr		 * Negative (or zero) offset: throw back any lines we should
40895926Stjr		 * not have read yet.
40995926Stjr		  */
41095926Stjr		if (p != NULL) {
41195926Stjr			toomuch(ofp, -ofs + 1);
41295926Stjr			nwritten = (intmax_t)truncofs;
41395926Stjr		} else
41495926Stjr			nwritten = (intmax_t)ftello(ofp);
41595926Stjr	} else {
41695926Stjr		/*
41795926Stjr		 * Positive offset: copy the requested number of lines
41895926Stjr		 * after the match.
41995926Stjr		 */
42095926Stjr		while (--ofs > 0 && (p = getline()) != NULL)
42195926Stjr			fputs(p, ofp);
42295926Stjr		toomuch(NULL, 0);
42395926Stjr		nwritten = (intmax_t)ftello(ofp);
42495926Stjr		if (fclose(ofp) != 0)
42595926Stjr			err(1, "%s", currfile);
42695926Stjr	}
42795926Stjr
42895926Stjr	if (!sflag && *expr == '/')
42995926Stjr		printf("%jd\n", nwritten);
43095926Stjr
43195926Stjr	regfree(&cre);
43295926Stjr	free(ecopy);
43395926Stjr}
43495926Stjr
43595926Stjr/* Handle splits based on line number. */
436227161Sedstatic void
43795926Stjrdo_lineno(const char *expr)
43895926Stjr{
43995926Stjr	long lastline, tgtline;
44095926Stjr	char *ep, *p;
44195926Stjr	FILE *ofp;
44295926Stjr
44395926Stjr	errno = 0;
44495926Stjr	tgtline = strtol(expr, &ep, 10);
44595926Stjr	if (tgtline <= 0 || errno != 0 || *ep != '\0')
44695926Stjr		errx(1, "%s: bad line number", expr);
44795926Stjr	lastline = tgtline;
44895926Stjr	if (lastline <= lineno)
44995926Stjr		errx(1, "%s: can't go backwards", expr);
45095926Stjr
45195926Stjr	while (nfiles < maxfiles - 1) {
45295926Stjr		ofp = newfile();
45395926Stjr		while (lineno + 1 != lastline) {
45495926Stjr			if ((p = getline()) == NULL)
45595926Stjr				errx(1, "%ld: out of range", lastline);
45695926Stjr			if (fputs(p, ofp) != 0)
45795926Stjr				break;
45895926Stjr		}
45995926Stjr		if (!sflag)
46095926Stjr			printf("%jd\n", (intmax_t)ftello(ofp));
46195926Stjr		if (fclose(ofp) != 0)
46295926Stjr			err(1, "%s", currfile);
46395926Stjr		if (reps-- == 0)
46495926Stjr			break;
46595926Stjr		lastline += tgtline;
46695926Stjr	}
46795926Stjr}
468