1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1987, 1993, 1994
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32#include <sys/param.h>
33#include <sys/stat.h>
34
35#include <ctype.h>
36#include <err.h>
37#include <errno.h>
38#include <fcntl.h>
39#include <inttypes.h>
40#include <libutil.h>
41#include <limits.h>
42#include <locale.h>
43#include <stdbool.h>
44#include <stdint.h>
45#include <stdio.h>
46#include <stdlib.h>
47#include <string.h>
48#include <unistd.h>
49#include <regex.h>
50#include <sysexits.h>
51
52#define DEFLINE	1000			/* Default num lines per file. */
53
54static off_t	 bytecnt;		/* Byte count to split on. */
55static long	 chunks;		/* Chunks count to split into. */
56static bool      clobber = true;        /* Whether to overwrite existing output files. */
57static long	 numlines;		/* Line count to split on. */
58static int	 file_open;		/* If a file open. */
59static int	 ifd = -1, ofd = -1;	/* Input/output file descriptors. */
60static char	 fname[MAXPATHLEN];	/* File name prefix. */
61static regex_t	 rgx;
62static int	 pflag;
63static bool	 dflag;
64static long	 sufflen = 2;		/* File name suffix length. */
65static bool	 autosfx = true;	/* Whether to auto-extend the suffix length. */
66
67static void newfile(void);
68static void split1(void);
69static void split2(void);
70static void split3(void);
71static void usage(void) __dead2;
72
73int
74main(int argc, char **argv)
75{
76	char errbuf[64];
77	const char *p, *errstr;
78	int ch, error;
79
80	setlocale(LC_ALL, "");
81
82	dflag = false;
83	while ((ch = getopt(argc, argv, "0::1::2::3::4::5::6::7::8::9::a:b:cdl:n:p:")) != -1)
84		switch (ch) {
85		case '0': case '1': case '2': case '3': case '4':
86		case '5': case '6': case '7': case '8': case '9':
87			/*
88			 * Undocumented kludge: split was originally designed
89			 * to take a number after a dash.
90			 */
91			if (numlines != 0)
92				usage();
93			numlines = ch - '0';
94			p = optarg ? optarg : "";
95			while (numlines >= 0 && *p >= '0' && *p <= '9')
96				numlines = numlines * 10 + *p++ - '0';
97			if (numlines <= 0 || *p != '\0')
98				errx(EX_USAGE, "%c%s: line count is invalid",
99				    ch, optarg ? optarg : "");
100			break;
101		case 'a':		/* Suffix length */
102			sufflen = strtonum(optarg, 0, INT_MAX, &errstr);
103			if (errstr != NULL) {
104				errx(EX_USAGE, "%s: suffix length is %s",
105				    optarg, errstr);
106			}
107			if (sufflen == 0) {
108				sufflen = 2;
109				autosfx = true;
110			} else {
111				autosfx = false;
112			}
113			break;
114		case 'b':		/* Byte count. */
115			if (expand_number(optarg, &bytecnt) != 0) {
116				errx(EX_USAGE, "%s: byte count is invalid",
117				    optarg);
118			}
119			break;
120		case 'c':               /* Continue, don't overwrite output files. */
121			clobber = false;
122			break;
123		case 'd':		/* Decimal suffix */
124			dflag = true;
125			break;
126		case 'l':		/* Line count. */
127			if (numlines != 0)
128				usage();
129			numlines = strtonum(optarg, 1, LONG_MAX, &errstr);
130			if (errstr != NULL) {
131				errx(EX_USAGE, "%s: line count is %s",
132				    optarg, errstr);
133			}
134			break;
135		case 'n':		/* Chunks. */
136			chunks = strtonum(optarg, 1, LONG_MAX, &errstr);
137			if (errstr != NULL) {
138				errx(EX_USAGE, "%s: number of chunks is %s",
139				    optarg, errstr);
140			}
141			break;
142
143		case 'p':		/* pattern matching. */
144			error = regcomp(&rgx, optarg, REG_EXTENDED|REG_NOSUB);
145			if (error != 0) {
146				regerror(error, &rgx, errbuf, sizeof(errbuf));
147				errx(EX_USAGE, "%s: regex is invalid: %s",
148				    optarg, errbuf);
149			}
150			pflag = 1;
151			break;
152		default:
153			usage();
154		}
155	argv += optind;
156	argc -= optind;
157
158	if (argc > 0) {			/* Input file. */
159		if (strcmp(*argv, "-") == 0)
160			ifd = STDIN_FILENO;
161		else if ((ifd = open(*argv, O_RDONLY, 0)) < 0)
162			err(EX_NOINPUT, "%s", *argv);
163		++argv;
164		--argc;
165	}
166	if (argc > 0) {			/* File name prefix. */
167		if (strlcpy(fname, *argv, sizeof(fname)) >= sizeof(fname)) {
168			errx(EX_USAGE, "%s: file name prefix is too long",
169			    *argv);
170		}
171		++argv;
172		--argc;
173	}
174	if (argc > 0)
175		usage();
176
177	if (strlen(fname) + (unsigned long)sufflen >= sizeof(fname))
178		errx(EX_USAGE, "suffix is too long");
179	if (pflag && (numlines != 0 || bytecnt != 0 || chunks != 0))
180		usage();
181
182	if (numlines == 0)
183		numlines = DEFLINE;
184	else if (bytecnt != 0 || chunks != 0)
185		usage();
186
187	if (bytecnt != 0 && chunks != 0)
188		usage();
189
190	if (ifd == -1)				/* Stdin by default. */
191		ifd = 0;
192
193	if (bytecnt != 0) {
194		split1();
195		exit (0);
196	} else if (chunks != 0) {
197		split3();
198		exit (0);
199	}
200	split2();
201	if (pflag)
202		regfree(&rgx);
203	exit(0);
204}
205
206/*
207 * split1 --
208 *	Split the input by bytes.
209 */
210static void
211split1(void)
212{
213	static char bfr[MAXBSIZE];
214	off_t bcnt;
215	char *C;
216	ssize_t dist, len;
217	int nfiles;
218
219	nfiles = 0;
220
221	for (bcnt = 0;;)
222		switch ((len = read(ifd, bfr, sizeof(bfr)))) {
223		case 0:
224			exit(0);
225		case -1:
226			err(EX_IOERR, "read");
227			/* NOTREACHED */
228		default:
229			if (!file_open) {
230				if (chunks == 0 || nfiles < chunks) {
231					newfile();
232					nfiles++;
233				}
234			}
235			if (bcnt + len >= bytecnt) {
236				dist = bytecnt - bcnt;
237				if (write(ofd, bfr, dist) != dist)
238					err(EX_IOERR, "write");
239				len -= dist;
240				for (C = bfr + dist; len >= bytecnt;
241				     len -= bytecnt, C += bytecnt) {
242					if (chunks == 0 || nfiles < chunks) {
243						newfile();
244						nfiles++;
245					}
246					if (write(ofd, C, bytecnt) != bytecnt)
247						err(EX_IOERR, "write");
248				}
249				if (len != 0) {
250					if (chunks == 0 || nfiles < chunks) {
251						newfile();
252						nfiles++;
253					}
254					if (write(ofd, C, len) != len)
255						err(EX_IOERR, "write");
256				} else {
257					file_open = 0;
258				}
259				bcnt = len;
260			} else {
261				bcnt += len;
262				if (write(ofd, bfr, len) != len)
263					err(EX_IOERR, "write");
264			}
265		}
266}
267
268/*
269 * split2 --
270 *	Split the input by lines.
271 */
272static void
273split2(void)
274{
275	char *buf;
276	size_t bufsize;
277	ssize_t len;
278	long lcnt = 0;
279	FILE *infp;
280
281	buf = NULL;
282	bufsize = 0;
283
284	/* Stick a stream on top of input file descriptor */
285	if ((infp = fdopen(ifd, "r")) == NULL)
286		err(EX_NOINPUT, "fdopen");
287
288	/* Process input one line at a time */
289	while ((errno = 0, len = getline(&buf, &bufsize, infp)) > 0) {
290		/* Check if we need to start a new file */
291		if (pflag) {
292			regmatch_t pmatch;
293
294			pmatch.rm_so = 0;
295			pmatch.rm_eo = len - 1;
296			if (regexec(&rgx, buf, 0, &pmatch, REG_STARTEND) == 0)
297				newfile();
298		} else if (lcnt++ == numlines) {
299			newfile();
300			lcnt = 1;
301		}
302
303		/* Open output file if needed */
304		if (!file_open)
305			newfile();
306
307		/* Write out line */
308		if (write(ofd, buf, len) != len)
309			err(EX_IOERR, "write");
310	}
311
312	/* EOF or error? */
313	if ((len == -1 && errno != 0) || ferror(infp))
314		err(EX_IOERR, "read");
315	else
316		exit(0);
317}
318
319/*
320 * split3 --
321 *	Split the input into specified number of chunks
322 */
323static void
324split3(void)
325{
326	struct stat sb;
327
328	if (fstat(ifd, &sb) == -1) {
329		err(1, "stat");
330		/* NOTREACHED */
331	}
332
333	if (chunks > sb.st_size) {
334		errx(1, "can't split into more than %d files",
335		    (int)sb.st_size);
336		/* NOTREACHED */
337	}
338
339	bytecnt = sb.st_size / chunks;
340	split1();
341}
342
343
344/*
345 * newfile --
346 *	Open a new output file.
347 */
348static void
349newfile(void)
350{
351	long i, maxfiles, tfnum;
352	static long fnum;
353	static char *fpnt;
354	char beg, end;
355	int pattlen;
356	int flags = O_WRONLY | O_CREAT | O_TRUNC;
357
358	if (!clobber)
359		flags |= O_EXCL;
360
361	if (ofd == -1) {
362		if (fname[0] == '\0') {
363			fname[0] = 'x';
364			fpnt = fname + 1;
365		} else {
366			fpnt = fname + strlen(fname);
367		}
368	} else if (close(ofd) != 0)
369		err(1, "%s", fname);
370
371	again:
372	if (dflag) {
373		beg = '0';
374		end = '9';
375	}
376	else {
377		beg = 'a';
378		end = 'z';
379	}
380	pattlen = end - beg + 1;
381
382	/*
383	 * If '-a' is not specified, then we automatically expand the
384	 * suffix length to accomodate splitting all input.  We do this
385	 * by moving the suffix pointer (fpnt) forward and incrementing
386	 * sufflen by one, thereby yielding an additional two characters
387	 * and allowing all output files to sort such that 'cat *' yields
388	 * the input in order.  I.e., the order is '... xyy xyz xzaaa
389	 * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on.
390	 */
391	if (!dflag && autosfx && (fpnt[0] == 'y') &&
392			strspn(fpnt+1, "z") == strlen(fpnt+1)) {
393		/* Ensure the generated filenames will fit into the buffer. */
394		if (strlen(fname) + 2 >= sizeof(fname))
395			errx(EX_USAGE, "combined filenames would be too long");
396
397		fpnt = fname + strlen(fname) - sufflen;
398		fpnt[sufflen + 2] = '\0';
399		fpnt[0] = end;
400		fpnt[1] = beg;
401
402		/*  Basename | Suffix
403		 *  before:
404		 *  x        | yz
405		 *  after:
406		 *  xz       | a.. */
407		fpnt++;
408		sufflen++;
409
410		/* Reset so we start back at all 'a's in our extended suffix. */
411		fnum = 0;
412	}
413
414	/* maxfiles = pattlen^sufflen, but don't use libm. */
415	for (maxfiles = 1, i = 0; i < sufflen; i++)
416		if (LONG_MAX / pattlen < maxfiles)
417			errx(EX_USAGE, "suffix is too long (max %ld)", i);
418		else
419			maxfiles *= pattlen;
420
421	if (fnum == maxfiles)
422		errx(EX_DATAERR, "too many files");
423
424	/* Generate suffix of sufflen letters */
425	tfnum = fnum;
426	i = sufflen - 1;
427	do {
428		fpnt[i] = tfnum % pattlen + beg;
429		tfnum /= pattlen;
430	} while (i-- > 0);
431	fpnt[sufflen] = '\0';
432
433	++fnum;
434	if ((ofd = open(fname, flags, DEFFILEMODE)) < 0) {
435		if (!clobber && errno == EEXIST)
436			goto again;
437		err(EX_IOERR, "%s", fname);
438	}
439	file_open = 1;
440}
441
/*
 * usage --
 *	Print the command synopsis to standard error and exit with
 *	EX_USAGE.
 */
static void
usage(void)
{
	static const char synopsis[] =
"usage: split [-cd] [-l line_count] [-a suffix_length] [file [prefix]]\n"
"       split [-cd] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n"
"       split [-cd] -n chunk_count [-a suffix_length] [file [prefix]]\n"
"       split [-cd] -p pattern [-a suffix_length] [file [prefix]]\n";

	(void)fputs(synopsis, stderr);
	exit(EX_USAGE);
}
452