csplit.c revision 100697
195926Stjr/*- 295926Stjr * Copyright (c) 2002 Tim J. Robbins. 395926Stjr * All rights reserved. 495926Stjr * 595926Stjr * Redistribution and use in source and binary forms, with or without 695926Stjr * modification, are permitted provided that the following conditions 795926Stjr * are met: 895926Stjr * 1. Redistributions of source code must retain the above copyright 995926Stjr * notice, this list of conditions and the following disclaimer. 1095926Stjr * 2. Redistributions in binary form must reproduce the above copyright 1195926Stjr * notice, this list of conditions and the following disclaimer in the 1295926Stjr * documentation and/or other materials provided with the distribution. 1395926Stjr * 1495926Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 1595926Stjr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1695926Stjr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1795926Stjr * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 1895926Stjr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1995926Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2095926Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2195926Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2295926Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2395926Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2495926Stjr * SUCH DAMAGE. 2595926Stjr */ 2695926Stjr 2795926Stjr/* 2895926Stjr * csplit -- split files based on context 2995926Stjr * 3095926Stjr * This utility splits its input into numbered output files by line number 3195926Stjr * or by a regular expression. Regular expression matches have an optional 3295926Stjr * offset with them, allowing the split to occur a specified number of 3395926Stjr * lines before or after the match. 3495926Stjr * 3595926Stjr * To handle negative offsets, we stop reading when the match occurs and 3695926Stjr * store the offset that the file should have been split at, then use 3795926Stjr * this output file as input until all the "overflowed" lines have been read. 3895926Stjr * The file is then closed and truncated to the correct length. 3995926Stjr * 4095926Stjr * We assume that the output files can be seeked upon (ie. they cannot be 4195926Stjr * symlinks to named pipes or character devices), but make no such 4295926Stjr * assumption about the input. 4395926Stjr */ 4495926Stjr 4595926Stjr#include <sys/cdefs.h> 4695926Stjr__FBSDID("$FreeBSD: head/usr.bin/csplit/csplit.c 100697 2002-07-26 05:25:12Z tjr $"); 4795926Stjr 4895926Stjr#include <sys/types.h> 4995926Stjr 5095926Stjr#include <ctype.h> 5195926Stjr#include <err.h> 5295926Stjr#include <errno.h> 5395926Stjr#include <limits.h> 5497977Stjr#include <locale.h> 5595926Stjr#include <regex.h> 5695926Stjr#include <signal.h> 5795926Stjr#include <stdint.h> 5895926Stjr#include <stdio.h> 5995926Stjr#include <stdlib.h> 6095926Stjr#include <string.h> 6195926Stjr#include <unistd.h> 6295926Stjr 6395926Stjrvoid cleanup(void); 6495926Stjrvoid do_lineno(const char *); 6595926Stjrvoid do_rexp(const char *); 6695926Stjrchar *getline(void); 6795926Stjrvoid handlesig(int); 6895926StjrFILE *newfile(void); 6995926Stjrvoid toomuch(FILE *, long); 7095926Stjrvoid usage(void); 7195926Stjr 7295926Stjr/* 7395926Stjr * Command line options 7495926Stjr */ 7595926Stjrconst char *prefix; /* File name prefix */ 7695926Stjrlong sufflen; /* Number of decimal digits for suffix */ 7795926Stjrint sflag; /* Suppress output of file names */ 7895926Stjrint kflag; /* Keep output if error occurs */ 7995926Stjr 8095926Stjr/* 8195926Stjr * Other miscellaneous globals (XXX too many) 8295926Stjr */ 8395926Stjrlong lineno; /* Current line number in input file */ 8495926Stjrlong reps; /* Number of repetitions for this pattern */ 8595926Stjrlong nfiles; /* Number of files output so far */ 8695926Stjrlong maxfiles; /* Maximum number of files we can create */ 8795926Stjrchar currfile[PATH_MAX]; /* Current output file */ 8895926Stjrconst char *infn; /* Name of the input file */ 8995926StjrFILE *infile; /* Input file handle */ 9095926StjrFILE *overfile; /* Overflow file for toomuch() */ 9195926Stjroff_t truncofs; /* Offset this file should be truncated at */ 9295926Stjrint doclean; /* Should cleanup() remove output? */ 9395926Stjr 9495926Stjrint 9595926Stjrmain(int argc, char *argv[]) 9695926Stjr{ 97100697Stjr struct sigaction sa; 9895926Stjr long i; 9995926Stjr int ch; 10095926Stjr const char *expr; 10195926Stjr char *ep, *p; 10295926Stjr FILE *ofp; 10395926Stjr 10497977Stjr setlocale(LC_ALL, ""); 10597977Stjr 10695926Stjr kflag = sflag = 0; 10795926Stjr prefix = "xx"; 10895926Stjr sufflen = 2; 10995926Stjr while ((ch = getopt(argc, argv, "ksf:n:")) > 0) { 11095926Stjr switch (ch) { 11195926Stjr case 'f': 11295926Stjr prefix = optarg; 11395926Stjr break; 11495926Stjr case 'k': 11595926Stjr kflag = 1; 11695926Stjr break; 11795926Stjr case 'n': 11895926Stjr errno = 0; 11995926Stjr sufflen = strtol(optarg, &ep, 10); 12095926Stjr if (sufflen <= 0 || *ep != '\0' || errno != 0) 12195926Stjr errx(1, "%s: bad suffix length", optarg); 12295926Stjr break; 12395926Stjr case 's': 12495926Stjr sflag = 1; 12595926Stjr break; 12695926Stjr default: 12795926Stjr usage(); 12895926Stjr /*NOTREACHED*/ 12995926Stjr } 13095926Stjr } 13195926Stjr 13295926Stjr if (sufflen + strlen(prefix) >= PATH_MAX) 13395926Stjr errx(1, "name too long"); 13495926Stjr 13595926Stjr argc -= optind; 13695926Stjr argv += optind; 13795926Stjr 13895926Stjr if ((infn = *argv++) == NULL) 13995926Stjr usage(); 14095926Stjr if (strcmp(infn, "-") == 0) { 14195926Stjr infile = stdin; 14295926Stjr infn = "stdin"; 14395926Stjr } else if ((infile = fopen(infn, "r")) == NULL) 14495926Stjr err(1, "%s", infn); 14595926Stjr 14695926Stjr if (!kflag) { 14795926Stjr doclean = 1; 14895926Stjr atexit(cleanup); 149100697Stjr sa.sa_flags = 0; 150100697Stjr sa.sa_handler = handlesig; 151100697Stjr sigemptyset(&sa.sa_mask); 152100697Stjr sigaddset(&sa.sa_mask, SIGHUP); 153100697Stjr sigaddset(&sa.sa_mask, SIGINT); 154100697Stjr sigaddset(&sa.sa_mask, SIGTERM); 155100697Stjr sigaction(SIGHUP, &sa, NULL); 156100697Stjr sigaction(SIGINT, &sa, NULL); 157100697Stjr sigaction(SIGTERM, &sa, NULL); 15895926Stjr } 15995926Stjr 16095926Stjr lineno = 0; 16195926Stjr nfiles = 0; 16295926Stjr truncofs = 0; 16395926Stjr overfile = NULL; 16495926Stjr 16595926Stjr /* Ensure 10^sufflen < LONG_MAX. */ 16695926Stjr for (maxfiles = 1, i = 0; i < sufflen; i++) { 16795926Stjr if (maxfiles > LONG_MAX / 10) 16895926Stjr errx(1, "%ld: suffix too long (limit %ld)", 16995926Stjr sufflen, i); 17095926Stjr maxfiles *= 10; 17195926Stjr } 17295926Stjr 17395926Stjr /* Create files based on supplied patterns. */ 17495926Stjr while (nfiles < maxfiles - 1 && (expr = *argv++) != NULL) { 17595926Stjr /* Look ahead & see if this pattern has any repetitions. */ 17695926Stjr if (*argv != NULL && **argv == '{') { 17795926Stjr errno = 0; 17895926Stjr reps = strtol(*argv + 1, &ep, 10); 17995926Stjr if (reps < 0 || *ep != '}' || errno != 0) 18095926Stjr errx(1, "%s: bad repetition count", *argv + 1); 18195926Stjr argv++; 18295926Stjr } else 18395926Stjr reps = 0; 18495926Stjr 18595926Stjr if (*expr == '/' || *expr == '%') { 18695926Stjr do 18795926Stjr do_rexp(expr); 18895926Stjr while (reps-- != 0 && nfiles < maxfiles - 1); 18995926Stjr } else if (isdigit((unsigned char)*expr)) 19095926Stjr do_lineno(expr); 19195926Stjr else 19295926Stjr errx(1, "%s: unrecognised pattern", expr); 19395926Stjr } 19495926Stjr 19595926Stjr /* Copy the rest into a new file. */ 19695926Stjr if (!feof(infile)) { 19795926Stjr ofp = newfile(); 19895926Stjr while ((p = getline()) != NULL && fputs(p, ofp) == 0) 19995926Stjr ; 20095926Stjr if (!sflag) 20195926Stjr printf("%jd\n", (intmax_t)ftello(ofp)); 20295926Stjr if (fclose(ofp) != 0) 20395926Stjr err(1, "%s", currfile); 20495926Stjr } 20595926Stjr 20695926Stjr toomuch(NULL, 0); 20795926Stjr doclean = 0; 20895926Stjr 20995926Stjr return (0); 21095926Stjr} 21195926Stjr 21295926Stjrvoid 21395926Stjrusage(void) 21495926Stjr{ 21595926Stjr 21695926Stjr fprintf(stderr, 21796708Stjr"usage: csplit [-ks] [-f prefix] [-n number] file args ...\n"); 21895926Stjr exit(1); 21995926Stjr} 22095926Stjr 22195926Stjrvoid 22295926Stjrhandlesig(int sig __unused) 22395926Stjr{ 22495926Stjr const char msg[] = "csplit: caught signal, cleaning up\n"; 22595926Stjr 22695926Stjr write(STDERR_FILENO, msg, sizeof(msg) - 1); 22795926Stjr cleanup(); 22895926Stjr _exit(2); 22995926Stjr} 23095926Stjr 23195926Stjr/* Create a new output file. */ 23295926StjrFILE * 23395926Stjrnewfile(void) 23495926Stjr{ 23595926Stjr FILE *fp; 23695926Stjr 23799024Stjr if (snprintf(currfile, sizeof(currfile), "%s%0*ld", prefix, 23899024Stjr (int)sufflen, nfiles) >= sizeof(currfile)) { 23999024Stjr errno = ENAMETOOLONG; 24099024Stjr err(1, NULL); 24199024Stjr } 24295926Stjr if ((fp = fopen(currfile, "w+")) == NULL) 24395926Stjr err(1, "%s", currfile); 24495926Stjr nfiles++; 24595926Stjr 24695926Stjr return (fp); 24795926Stjr} 24895926Stjr 24995926Stjr/* Remove partial output, called before exiting. */ 25095926Stjrvoid 25195926Stjrcleanup(void) 25295926Stjr{ 25395926Stjr char fnbuf[PATH_MAX]; 25495926Stjr long i; 25595926Stjr 25695926Stjr if (!doclean) 25795926Stjr return; 25895926Stjr 25995926Stjr /* 26095926Stjr * NOTE: One cannot portably assume to be able to call snprintf() 26195926Stjr * from inside a signal handler. It does, however, appear to be safe 26295926Stjr * to do on FreeBSD. The solution to this problem is worse than the 26395926Stjr * problem itself. 26495926Stjr */ 26595926Stjr 26695926Stjr for (i = 0; i < nfiles; i++) { 26795926Stjr snprintf(fnbuf, sizeof(fnbuf), "%s%0*ld", prefix, 26895926Stjr (int)sufflen, i); 26995926Stjr unlink(fnbuf); 27095926Stjr } 27195926Stjr} 27295926Stjr 27395926Stjr/* Read a line from the input into a static buffer. */ 27495926Stjrchar * 27595926Stjrgetline(void) 27695926Stjr{ 27795926Stjr static char lbuf[LINE_MAX]; 27895926Stjr FILE *src; 27995926Stjr 28095926Stjr src = overfile != NULL ? overfile : infile; 28195926Stjr 28295926Stjragain: if (fgets(lbuf, sizeof(lbuf), src) == NULL) { 28395926Stjr if (src == overfile) { 28495926Stjr src = infile; 28595926Stjr goto again; 28695926Stjr } 28795926Stjr return (NULL); 28895926Stjr } 28995926Stjr if (ferror(src)) 29095926Stjr err(1, "%s", infn); 29195926Stjr lineno++; 29295926Stjr 29395926Stjr return (lbuf); 29495926Stjr} 29595926Stjr 29695926Stjr/* Conceptually rewind the input (as obtained by getline()) back `n' lines. */ 29795926Stjrvoid 29895926Stjrtoomuch(FILE *ofp, long n) 29995926Stjr{ 30095926Stjr char buf[BUFSIZ]; 30195926Stjr size_t i, nread; 30295926Stjr 30395926Stjr if (overfile != NULL) { 30495926Stjr /* 30595926Stjr * Truncate the previous file we overflowed into back to 30695926Stjr * the correct length, close it. 30795926Stjr */ 30895926Stjr if (fflush(overfile) != 0) 30995926Stjr err(1, "overflow"); 31095926Stjr if (ftruncate(fileno(overfile), truncofs) != 0) 31195926Stjr err(1, "overflow"); 31295926Stjr if (fclose(overfile) != 0) 31395926Stjr err(1, "overflow"); 31495926Stjr overfile = NULL; 31595926Stjr } 31695926Stjr 31795926Stjr if (n == 0) 31895926Stjr /* Just tidying up */ 31995926Stjr return; 32095926Stjr 32195926Stjr lineno -= n; 32295926Stjr 32395926Stjr /* 32495926Stjr * Wind the overflow file backwards to `n' lines before the 32595926Stjr * current one. 32695926Stjr */ 32795926Stjr do { 32895926Stjr if (ftello(ofp) < (off_t)sizeof(buf)) 32995926Stjr rewind(ofp); 33095926Stjr else 33195926Stjr fseek(ofp, -(long)sizeof(buf), SEEK_CUR); 33295926Stjr if (ferror(ofp)) 33395926Stjr errx(1, "%s: can't seek", currfile); 33495926Stjr if ((nread = fread(buf, 1, sizeof(buf), ofp)) == 0) 33595926Stjr errx(1, "can't read overflowed output"); 33695926Stjr if (fseek(ofp, -(long)nread, SEEK_CUR) != 0) 33795926Stjr err(1, "%s", currfile); 33895926Stjr for (i = 1; i <= nread; i++) 33995926Stjr if (buf[nread - i] == '\n' && n-- == 0) 34095926Stjr break; 34196710Stjr if (ftello(ofp) == 0) 34296710Stjr break; 34395926Stjr } while (n > 0); 34495926Stjr if (fseek(ofp, nread - i + 1, SEEK_CUR) != 0) 34595926Stjr err(1, "%s", currfile); 34695926Stjr 34795926Stjr /* 34895926Stjr * getline() will read from here. Next call will truncate to 34995926Stjr * truncofs in this file. 35095926Stjr */ 35195926Stjr overfile = ofp; 35295926Stjr truncofs = ftello(overfile); 35395926Stjr} 35495926Stjr 35595926Stjr/* Handle splits for /regexp/ and %regexp% patterns. */ 35695926Stjrvoid 35795926Stjrdo_rexp(const char *expr) 35895926Stjr{ 35995926Stjr regex_t cre; 36095926Stjr intmax_t nwritten; 36195926Stjr long ofs; 36295926Stjr int first; 36395926Stjr char *ecopy, *ep, *p, *pofs, *re; 36495926Stjr FILE *ofp; 36595926Stjr 36695926Stjr if ((ecopy = strdup(expr)) == NULL) 36795926Stjr err(1, "strdup"); 36895926Stjr 36995926Stjr re = ecopy + 1; 37095926Stjr if ((pofs = strrchr(ecopy, *expr)) == NULL || pofs[-1] == '\\') 37195926Stjr errx(1, "%s: missing trailing %c", expr, *expr); 37295926Stjr *pofs++ = '\0'; 37395926Stjr 37495926Stjr if (*pofs != '\0') { 37595926Stjr errno = 0; 37695926Stjr ofs = strtol(pofs, &ep, 10); 37795926Stjr if (*ep != '\0' || errno != 0) 37895926Stjr errx(1, "%s: bad offset", pofs); 37995926Stjr } else 38095926Stjr ofs = 0; 38195926Stjr 38295926Stjr if (regcomp(&cre, re, REG_BASIC|REG_NOSUB) != 0) 38395926Stjr errx(1, "%s: bad regular expression", re); 38495926Stjr 38595926Stjr if (*expr == '/') 38695926Stjr /* /regexp/: Save results to a file. */ 38795926Stjr ofp = newfile(); 38895926Stjr else { 38995926Stjr /* %regexp%: Make a temporary file for overflow. */ 39095926Stjr if ((ofp = tmpfile()) == NULL) 39195926Stjr err(1, "tmpfile"); 39295926Stjr } 39395926Stjr 39495926Stjr /* Read and output lines until we get a match. */ 39595926Stjr first = 1; 39695926Stjr while ((p = getline()) != NULL) { 39795926Stjr if (fputs(p, ofp) != 0) 39895926Stjr break; 39995926Stjr if (!first && regexec(&cre, p, 0, NULL, 0) == 0) 40095926Stjr break; 40195926Stjr first = 0; 40295926Stjr } 40395926Stjr 40495926Stjr if (p == NULL) 40595926Stjr errx(1, "%s: no match", re); 40695926Stjr 40795926Stjr if (ofs <= 0) { 40895926Stjr /* 40995926Stjr * Negative (or zero) offset: throw back any lines we should 41095926Stjr * not have read yet. 41195926Stjr */ 41295926Stjr if (p != NULL) { 41395926Stjr toomuch(ofp, -ofs + 1); 41495926Stjr nwritten = (intmax_t)truncofs; 41595926Stjr } else 41695926Stjr nwritten = (intmax_t)ftello(ofp); 41795926Stjr } else { 41895926Stjr /* 41995926Stjr * Positive offset: copy the requested number of lines 42095926Stjr * after the match. 42195926Stjr */ 42295926Stjr while (--ofs > 0 && (p = getline()) != NULL) 42395926Stjr fputs(p, ofp); 42495926Stjr toomuch(NULL, 0); 42595926Stjr nwritten = (intmax_t)ftello(ofp); 42695926Stjr if (fclose(ofp) != 0) 42795926Stjr err(1, "%s", currfile); 42895926Stjr } 42995926Stjr 43095926Stjr if (!sflag && *expr == '/') 43195926Stjr printf("%jd\n", nwritten); 43295926Stjr 43395926Stjr regfree(&cre); 43495926Stjr free(ecopy); 43595926Stjr} 43695926Stjr 43795926Stjr/* Handle splits based on line number. */ 43895926Stjrvoid 43995926Stjrdo_lineno(const char *expr) 44095926Stjr{ 44195926Stjr long lastline, tgtline; 44295926Stjr char *ep, *p; 44395926Stjr FILE *ofp; 44495926Stjr 44595926Stjr errno = 0; 44695926Stjr tgtline = strtol(expr, &ep, 10); 44795926Stjr if (tgtline <= 0 || errno != 0 || *ep != '\0') 44895926Stjr errx(1, "%s: bad line number", expr); 44995926Stjr lastline = tgtline; 45095926Stjr if (lastline <= lineno) 45195926Stjr errx(1, "%s: can't go backwards", expr); 45295926Stjr 45395926Stjr while (nfiles < maxfiles - 1) { 45495926Stjr ofp = newfile(); 45595926Stjr while (lineno + 1 != lastline) { 45695926Stjr if ((p = getline()) == NULL) 45795926Stjr errx(1, "%ld: out of range", lastline); 45895926Stjr if (fputs(p, ofp) != 0) 45995926Stjr break; 46095926Stjr } 46195926Stjr if (!sflag) 46295926Stjr printf("%jd\n", (intmax_t)ftello(ofp)); 46395926Stjr if (fclose(ofp) != 0) 46495926Stjr err(1, "%s", currfile); 46595926Stjr if (reps-- == 0) 46695926Stjr break; 46795926Stjr lastline += tgtline; 46895926Stjr } 46995926Stjr} 470