195926Stjr/*- 295926Stjr * Copyright (c) 2002 Tim J. Robbins. 395926Stjr * All rights reserved. 495926Stjr * 595926Stjr * Redistribution and use in source and binary forms, with or without 695926Stjr * modification, are permitted provided that the following conditions 795926Stjr * are met: 895926Stjr * 1. Redistributions of source code must retain the above copyright 995926Stjr * notice, this list of conditions and the following disclaimer. 1095926Stjr * 2. Redistributions in binary form must reproduce the above copyright 1195926Stjr * notice, this list of conditions and the following disclaimer in the 1295926Stjr * documentation and/or other materials provided with the distribution. 1395926Stjr * 1495926Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 1595926Stjr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1695926Stjr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1795926Stjr * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 1895926Stjr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1995926Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2095926Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2195926Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2295926Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2395926Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2495926Stjr * SUCH DAMAGE. 2595926Stjr */ 2695926Stjr 2795926Stjr/* 2895926Stjr * csplit -- split files based on context 2995926Stjr * 3095926Stjr * This utility splits its input into numbered output files by line number 3195926Stjr * or by a regular expression. Regular expression matches have an optional 3295926Stjr * offset with them, allowing the split to occur a specified number of 3395926Stjr * lines before or after the match. 3495926Stjr * 3595926Stjr * To handle negative offsets, we stop reading when the match occurs and 3695926Stjr * store the offset that the file should have been split at, then use 3795926Stjr * this output file as input until all the "overflowed" lines have been read. 3895926Stjr * The file is then closed and truncated to the correct length. 3995926Stjr * 4095926Stjr * We assume that the output files can be seeked upon (ie. they cannot be 4195926Stjr * symlinks to named pipes or character devices), but make no such 4295926Stjr * assumption about the input. 4395926Stjr */ 4495926Stjr 4595926Stjr#include <sys/cdefs.h> 4695926Stjr__FBSDID("$FreeBSD$"); 4795926Stjr 4895926Stjr#include <sys/types.h> 4995926Stjr 5095926Stjr#include <ctype.h> 5195926Stjr#include <err.h> 5295926Stjr#include <errno.h> 5395926Stjr#include <limits.h> 5497977Stjr#include <locale.h> 5595926Stjr#include <regex.h> 5695926Stjr#include <signal.h> 5795926Stjr#include <stdint.h> 5895926Stjr#include <stdio.h> 5995926Stjr#include <stdlib.h> 6095926Stjr#include <string.h> 6195926Stjr#include <unistd.h> 6295926Stjr 63227161Sedstatic void cleanup(void); 64227161Sedstatic void do_lineno(const char *); 65227161Sedstatic void do_rexp(const char *); 66227161Sedstatic char *getline(void); 67227161Sedstatic void handlesig(int); 68227161Sedstatic FILE *newfile(void); 69227161Sedstatic void toomuch(FILE *, long); 70227161Sedstatic void usage(void); 7195926Stjr 7295926Stjr/* 7395926Stjr * Command line options 7495926Stjr */ 75227161Sedstatic const char *prefix; /* File name prefix */ 76227161Sedstatic long sufflen; /* Number of decimal digits for suffix */ 77227161Sedstatic int sflag; /* Suppress output of file names */ 78227161Sedstatic int kflag; /* Keep output if error occurs */ 7995926Stjr 8095926Stjr/* 8195926Stjr * Other miscellaneous globals (XXX too many) 8295926Stjr */ 83227161Sedstatic long lineno; /* Current line number in input file */ 84227161Sedstatic long reps; /* Number of repetitions for this pattern */ 85227161Sedstatic long nfiles; /* Number of files output so far */ 86227161Sedstatic long maxfiles; /* Maximum number of files we can create */ 87227161Sedstatic char currfile[PATH_MAX]; /* Current output file */ 88227161Sedstatic const char *infn; /* Name of the input file */ 89227161Sedstatic FILE *infile; /* Input file handle */ 90227161Sedstatic FILE *overfile; /* Overflow file for toomuch() */ 91227161Sedstatic off_t truncofs; /* Offset this file should be truncated at */ 92227161Sedstatic int doclean; /* Should cleanup() remove output? */ 9395926Stjr 9495926Stjrint 9595926Stjrmain(int argc, char *argv[]) 9695926Stjr{ 97100697Stjr struct sigaction sa; 9895926Stjr long i; 9995926Stjr int ch; 10095926Stjr const char *expr; 10195926Stjr char *ep, *p; 10295926Stjr FILE *ofp; 10395926Stjr 10497977Stjr setlocale(LC_ALL, ""); 10597977Stjr 10695926Stjr kflag = sflag = 0; 10795926Stjr prefix = "xx"; 10895926Stjr sufflen = 2; 10995926Stjr while ((ch = getopt(argc, argv, "ksf:n:")) > 0) { 11095926Stjr switch (ch) { 11195926Stjr case 'f': 11295926Stjr prefix = optarg; 11395926Stjr break; 11495926Stjr case 'k': 11595926Stjr kflag = 1; 11695926Stjr break; 11795926Stjr case 'n': 11895926Stjr errno = 0; 11995926Stjr sufflen = strtol(optarg, &ep, 10); 12095926Stjr if (sufflen <= 0 || *ep != '\0' || errno != 0) 12195926Stjr errx(1, "%s: bad suffix length", optarg); 12295926Stjr break; 12395926Stjr case 's': 12495926Stjr sflag = 1; 12595926Stjr break; 12695926Stjr default: 12795926Stjr usage(); 12895926Stjr /*NOTREACHED*/ 12995926Stjr } 13095926Stjr } 13195926Stjr 13295926Stjr if (sufflen + strlen(prefix) >= PATH_MAX) 13395926Stjr errx(1, "name too long"); 13495926Stjr 13595926Stjr argc -= optind; 13695926Stjr argv += optind; 13795926Stjr 13895926Stjr if ((infn = *argv++) == NULL) 13995926Stjr usage(); 14095926Stjr if (strcmp(infn, "-") == 0) { 14195926Stjr infile = stdin; 14295926Stjr infn = "stdin"; 14395926Stjr } else if ((infile = fopen(infn, "r")) == NULL) 14495926Stjr err(1, "%s", infn); 14595926Stjr 14695926Stjr if (!kflag) { 14795926Stjr doclean = 1; 14895926Stjr atexit(cleanup); 149100697Stjr sa.sa_flags = 0; 150100697Stjr sa.sa_handler = handlesig; 151100697Stjr sigemptyset(&sa.sa_mask); 152100697Stjr sigaddset(&sa.sa_mask, SIGHUP); 153100697Stjr sigaddset(&sa.sa_mask, SIGINT); 154100697Stjr sigaddset(&sa.sa_mask, SIGTERM); 155100697Stjr sigaction(SIGHUP, &sa, NULL); 156100697Stjr sigaction(SIGINT, &sa, NULL); 157100697Stjr sigaction(SIGTERM, &sa, NULL); 15895926Stjr } 15995926Stjr 16095926Stjr lineno = 0; 16195926Stjr nfiles = 0; 16295926Stjr truncofs = 0; 16395926Stjr overfile = NULL; 16495926Stjr 16595926Stjr /* Ensure 10^sufflen < LONG_MAX. */ 16695926Stjr for (maxfiles = 1, i = 0; i < sufflen; i++) { 16795926Stjr if (maxfiles > LONG_MAX / 10) 16895926Stjr errx(1, "%ld: suffix too long (limit %ld)", 16995926Stjr sufflen, i); 17095926Stjr maxfiles *= 10; 17195926Stjr } 17295926Stjr 17395926Stjr /* Create files based on supplied patterns. */ 17495926Stjr while (nfiles < maxfiles - 1 && (expr = *argv++) != NULL) { 17595926Stjr /* Look ahead & see if this pattern has any repetitions. */ 17695926Stjr if (*argv != NULL && **argv == '{') { 17795926Stjr errno = 0; 17895926Stjr reps = strtol(*argv + 1, &ep, 10); 17995926Stjr if (reps < 0 || *ep != '}' || errno != 0) 18095926Stjr errx(1, "%s: bad repetition count", *argv + 1); 18195926Stjr argv++; 18295926Stjr } else 18395926Stjr reps = 0; 18495926Stjr 18595926Stjr if (*expr == '/' || *expr == '%') { 18695926Stjr do 18795926Stjr do_rexp(expr); 18895926Stjr while (reps-- != 0 && nfiles < maxfiles - 1); 18995926Stjr } else if (isdigit((unsigned char)*expr)) 19095926Stjr do_lineno(expr); 19195926Stjr else 19295926Stjr errx(1, "%s: unrecognised pattern", expr); 19395926Stjr } 19495926Stjr 19595926Stjr /* Copy the rest into a new file. */ 19695926Stjr if (!feof(infile)) { 19795926Stjr ofp = newfile(); 19895926Stjr while ((p = getline()) != NULL && fputs(p, ofp) == 0) 19995926Stjr ; 20095926Stjr if (!sflag) 20195926Stjr printf("%jd\n", (intmax_t)ftello(ofp)); 20295926Stjr if (fclose(ofp) != 0) 20395926Stjr err(1, "%s", currfile); 20495926Stjr } 20595926Stjr 20695926Stjr toomuch(NULL, 0); 20795926Stjr doclean = 0; 20895926Stjr 20995926Stjr return (0); 21095926Stjr} 21195926Stjr 212227161Sedstatic void 21395926Stjrusage(void) 21495926Stjr{ 21595926Stjr 21695926Stjr fprintf(stderr, 21796708Stjr"usage: csplit [-ks] [-f prefix] [-n number] file args ...\n"); 21895926Stjr exit(1); 21995926Stjr} 22095926Stjr 221227161Sedstatic void 22295926Stjrhandlesig(int sig __unused) 22395926Stjr{ 22495926Stjr const char msg[] = "csplit: caught signal, cleaning up\n"; 22595926Stjr 22695926Stjr write(STDERR_FILENO, msg, sizeof(msg) - 1); 22795926Stjr cleanup(); 22895926Stjr _exit(2); 22995926Stjr} 23095926Stjr 23195926Stjr/* Create a new output file. */ 232227161Sedstatic FILE * 23395926Stjrnewfile(void) 23495926Stjr{ 23595926Stjr FILE *fp; 23695926Stjr 237100821Sdwmalone if ((size_t)snprintf(currfile, sizeof(currfile), "%s%0*ld", prefix, 238127298Stjr (int)sufflen, nfiles) >= sizeof(currfile)) 239127298Stjr errc(1, ENAMETOOLONG, NULL); 24095926Stjr if ((fp = fopen(currfile, "w+")) == NULL) 24195926Stjr err(1, "%s", currfile); 24295926Stjr nfiles++; 24395926Stjr 24495926Stjr return (fp); 24595926Stjr} 24695926Stjr 24795926Stjr/* Remove partial output, called before exiting. */ 248227161Sedstatic void 24995926Stjrcleanup(void) 25095926Stjr{ 25195926Stjr char fnbuf[PATH_MAX]; 25295926Stjr long i; 25395926Stjr 25495926Stjr if (!doclean) 25595926Stjr return; 25695926Stjr 25795926Stjr /* 25895926Stjr * NOTE: One cannot portably assume to be able to call snprintf() 25995926Stjr * from inside a signal handler. It does, however, appear to be safe 26095926Stjr * to do on FreeBSD. The solution to this problem is worse than the 26195926Stjr * problem itself. 26295926Stjr */ 26395926Stjr 26495926Stjr for (i = 0; i < nfiles; i++) { 26595926Stjr snprintf(fnbuf, sizeof(fnbuf), "%s%0*ld", prefix, 26695926Stjr (int)sufflen, i); 26795926Stjr unlink(fnbuf); 26895926Stjr } 26995926Stjr} 27095926Stjr 27195926Stjr/* Read a line from the input into a static buffer. */ 272227161Sedstatic char * 27395926Stjrgetline(void) 27495926Stjr{ 27595926Stjr static char lbuf[LINE_MAX]; 27695926Stjr FILE *src; 27795926Stjr 27895926Stjr src = overfile != NULL ? overfile : infile; 27995926Stjr 28095926Stjragain: if (fgets(lbuf, sizeof(lbuf), src) == NULL) { 28195926Stjr if (src == overfile) { 28295926Stjr src = infile; 28395926Stjr goto again; 28495926Stjr } 28595926Stjr return (NULL); 28695926Stjr } 28795926Stjr if (ferror(src)) 28895926Stjr err(1, "%s", infn); 28995926Stjr lineno++; 29095926Stjr 29195926Stjr return (lbuf); 29295926Stjr} 29395926Stjr 29495926Stjr/* Conceptually rewind the input (as obtained by getline()) back `n' lines. */ 295227161Sedstatic void 29695926Stjrtoomuch(FILE *ofp, long n) 29795926Stjr{ 29895926Stjr char buf[BUFSIZ]; 29995926Stjr size_t i, nread; 30095926Stjr 30195926Stjr if (overfile != NULL) { 30295926Stjr /* 30395926Stjr * Truncate the previous file we overflowed into back to 30495926Stjr * the correct length, close it. 30595926Stjr */ 30695926Stjr if (fflush(overfile) != 0) 30795926Stjr err(1, "overflow"); 30895926Stjr if (ftruncate(fileno(overfile), truncofs) != 0) 30995926Stjr err(1, "overflow"); 31095926Stjr if (fclose(overfile) != 0) 31195926Stjr err(1, "overflow"); 31295926Stjr overfile = NULL; 31395926Stjr } 31495926Stjr 31595926Stjr if (n == 0) 31695926Stjr /* Just tidying up */ 31795926Stjr return; 31895926Stjr 31995926Stjr lineno -= n; 32095926Stjr 32195926Stjr /* 32295926Stjr * Wind the overflow file backwards to `n' lines before the 32395926Stjr * current one. 32495926Stjr */ 32595926Stjr do { 32695926Stjr if (ftello(ofp) < (off_t)sizeof(buf)) 32795926Stjr rewind(ofp); 32895926Stjr else 329127300Stjr fseeko(ofp, -(off_t)sizeof(buf), SEEK_CUR); 33095926Stjr if (ferror(ofp)) 33195926Stjr errx(1, "%s: can't seek", currfile); 33295926Stjr if ((nread = fread(buf, 1, sizeof(buf), ofp)) == 0) 33395926Stjr errx(1, "can't read overflowed output"); 334127300Stjr if (fseeko(ofp, -(off_t)nread, SEEK_CUR) != 0) 33595926Stjr err(1, "%s", currfile); 33695926Stjr for (i = 1; i <= nread; i++) 33795926Stjr if (buf[nread - i] == '\n' && n-- == 0) 33895926Stjr break; 33996710Stjr if (ftello(ofp) == 0) 34096710Stjr break; 34195926Stjr } while (n > 0); 342127300Stjr if (fseeko(ofp, nread - i + 1, SEEK_CUR) != 0) 34395926Stjr err(1, "%s", currfile); 34495926Stjr 34595926Stjr /* 34695926Stjr * getline() will read from here. Next call will truncate to 34795926Stjr * truncofs in this file. 34895926Stjr */ 34995926Stjr overfile = ofp; 35095926Stjr truncofs = ftello(overfile); 35195926Stjr} 35295926Stjr 35395926Stjr/* Handle splits for /regexp/ and %regexp% patterns. */ 354227161Sedstatic void 35595926Stjrdo_rexp(const char *expr) 35695926Stjr{ 35795926Stjr regex_t cre; 35895926Stjr intmax_t nwritten; 35995926Stjr long ofs; 36095926Stjr int first; 36195926Stjr char *ecopy, *ep, *p, *pofs, *re; 36295926Stjr FILE *ofp; 36395926Stjr 36495926Stjr if ((ecopy = strdup(expr)) == NULL) 36595926Stjr err(1, "strdup"); 36695926Stjr 36795926Stjr re = ecopy + 1; 36895926Stjr if ((pofs = strrchr(ecopy, *expr)) == NULL || pofs[-1] == '\\') 36995926Stjr errx(1, "%s: missing trailing %c", expr, *expr); 37095926Stjr *pofs++ = '\0'; 37195926Stjr 37295926Stjr if (*pofs != '\0') { 37395926Stjr errno = 0; 37495926Stjr ofs = strtol(pofs, &ep, 10); 37595926Stjr if (*ep != '\0' || errno != 0) 37695926Stjr errx(1, "%s: bad offset", pofs); 37795926Stjr } else 37895926Stjr ofs = 0; 37995926Stjr 38095926Stjr if (regcomp(&cre, re, REG_BASIC|REG_NOSUB) != 0) 38195926Stjr errx(1, "%s: bad regular expression", re); 38295926Stjr 38395926Stjr if (*expr == '/') 38495926Stjr /* /regexp/: Save results to a file. */ 38595926Stjr ofp = newfile(); 38695926Stjr else { 38795926Stjr /* %regexp%: Make a temporary file for overflow. */ 38895926Stjr if ((ofp = tmpfile()) == NULL) 38995926Stjr err(1, "tmpfile"); 39095926Stjr } 39195926Stjr 39295926Stjr /* Read and output lines until we get a match. */ 39395926Stjr first = 1; 39495926Stjr while ((p = getline()) != NULL) { 39595926Stjr if (fputs(p, ofp) != 0) 39695926Stjr break; 39795926Stjr if (!first && regexec(&cre, p, 0, NULL, 0) == 0) 39895926Stjr break; 39995926Stjr first = 0; 40095926Stjr } 40195926Stjr 40295926Stjr if (p == NULL) 40395926Stjr errx(1, "%s: no match", re); 40495926Stjr 40595926Stjr if (ofs <= 0) { 40695926Stjr /* 40795926Stjr * Negative (or zero) offset: throw back any lines we should 40895926Stjr * not have read yet. 40995926Stjr */ 41095926Stjr if (p != NULL) { 41195926Stjr toomuch(ofp, -ofs + 1); 41295926Stjr nwritten = (intmax_t)truncofs; 41395926Stjr } else 41495926Stjr nwritten = (intmax_t)ftello(ofp); 41595926Stjr } else { 41695926Stjr /* 41795926Stjr * Positive offset: copy the requested number of lines 41895926Stjr * after the match. 41995926Stjr */ 42095926Stjr while (--ofs > 0 && (p = getline()) != NULL) 42195926Stjr fputs(p, ofp); 42295926Stjr toomuch(NULL, 0); 42395926Stjr nwritten = (intmax_t)ftello(ofp); 42495926Stjr if (fclose(ofp) != 0) 42595926Stjr err(1, "%s", currfile); 42695926Stjr } 42795926Stjr 42895926Stjr if (!sflag && *expr == '/') 42995926Stjr printf("%jd\n", nwritten); 43095926Stjr 43195926Stjr regfree(&cre); 43295926Stjr free(ecopy); 43395926Stjr} 43495926Stjr 43595926Stjr/* Handle splits based on line number. */ 436227161Sedstatic void 43795926Stjrdo_lineno(const char *expr) 43895926Stjr{ 43995926Stjr long lastline, tgtline; 44095926Stjr char *ep, *p; 44195926Stjr FILE *ofp; 44295926Stjr 44395926Stjr errno = 0; 44495926Stjr tgtline = strtol(expr, &ep, 10); 44595926Stjr if (tgtline <= 0 || errno != 0 || *ep != '\0') 44695926Stjr errx(1, "%s: bad line number", expr); 44795926Stjr lastline = tgtline; 44895926Stjr if (lastline <= lineno) 44995926Stjr errx(1, "%s: can't go backwards", expr); 45095926Stjr 45195926Stjr while (nfiles < maxfiles - 1) { 45295926Stjr ofp = newfile(); 45395926Stjr while (lineno + 1 != lastline) { 45495926Stjr if ((p = getline()) == NULL) 45595926Stjr errx(1, "%ld: out of range", lastline); 45695926Stjr if (fputs(p, ofp) != 0) 45795926Stjr break; 45895926Stjr } 45995926Stjr if (!sflag) 46095926Stjr printf("%jd\n", (intmax_t)ftello(ofp)); 46195926Stjr if (fclose(ofp) != 0) 46295926Stjr err(1, "%s", currfile); 46395926Stjr if (reps-- == 0) 46495926Stjr break; 46595926Stjr lastline += tgtline; 46695926Stjr } 46795926Stjr} 468