/* csplit.c revision 1.2 */
1/* $NetBSD: csplit.c,v 1.2 2007/07/18 01:32:33 lukem Exp $ */ 2/* $FreeBSD: src/usr.bin/csplit/csplit.c,v 1.9 2004/03/22 11:15:03 tjr Exp$ */ 3 4/*- 5 * Copyright (c) 2002 Tim J. Robbins. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30/* 31 * csplit -- split files based on context 32 * 33 * This utility splits its input into numbered output files by line number 34 * or by a regular expression. Regular expression matches have an optional 35 * offset with them, allowing the split to occur a specified number of 36 * lines before or after the match. 
37 * 38 * To handle negative offsets, we stop reading when the match occurs and 39 * store the offset that the file should have been split at, then use 40 * this output file as input until all the "overflowed" lines have been read. 41 * The file is then closed and truncated to the correct length. 42 * 43 * We assume that the output files can be seeked upon (ie. they cannot be 44 * symlinks to named pipes or character devices), but make no such 45 * assumption about the input. 46 */ 47 48#include <sys/cdefs.h> 49#ifndef lint 50__RCSID("$NetBSD: csplit.c,v 1.2 2007/07/18 01:32:33 lukem Exp $"); 51#endif 52 53#include <sys/types.h> 54 55#include <ctype.h> 56#include <err.h> 57#include <errno.h> 58#include <limits.h> 59#include <locale.h> 60#include <regex.h> 61#include <signal.h> 62#include <stdint.h> 63#include <stdio.h> 64#include <stdlib.h> 65#include <string.h> 66#include <unistd.h> 67 68static void cleanup(void); 69static void do_lineno(const char *); 70static void do_rexp(const char *); 71static char *getline(void); 72static void handlesig(int); 73static FILE *newfile(void); 74static void toomuch(FILE *, long); 75static void usage(void) __attribute__((__noreturn__)); 76 77/* 78 * Command line options 79 */ 80const char *prefix; /* File name prefix */ 81long sufflen; /* Number of decimal digits for suffix */ 82int sflag; /* Suppress output of file names */ 83int kflag; /* Keep output if error occurs */ 84 85/* 86 * Other miscellaneous globals (XXX too many) 87 */ 88long lineno; /* Current line number in input file */ 89long reps; /* Number of repetitions for this pattern */ 90long nfiles; /* Number of files output so far */ 91long maxfiles; /* Maximum number of files we can create */ 92char currfile[PATH_MAX]; /* Current output file */ 93const char *infn; /* Name of the input file */ 94FILE *infile; /* Input file handle */ 95FILE *overfile; /* Overflow file for toomuch() */ 96off_t truncofs; /* Offset this file should be truncated at */ 97int doclean; /* Should 
cleanup() remove output? */ 98 99int 100main(int argc, char *argv[]) 101{ 102 struct sigaction sa; 103 long i; 104 int ch; 105 const char *expr; 106 char *ep, *p; 107 FILE *ofp; 108 109 (void)setlocale(LC_ALL, ""); 110 111 kflag = sflag = 0; 112 prefix = "xx"; 113 sufflen = 2; 114 while ((ch = getopt(argc, argv, "ksf:n:")) > 0) { 115 switch (ch) { 116 case 'f': 117 prefix = optarg; 118 break; 119 case 'k': 120 kflag = 1; 121 break; 122 case 'n': 123 errno = 0; 124 sufflen = strtol(optarg, &ep, 10); 125 if (sufflen <= 0 || *ep != '\0' || errno != 0) 126 errx(1, "%s: bad suffix length", optarg); 127 break; 128 case 's': 129 sflag = 1; 130 break; 131 default: 132 usage(); 133 /*NOTREACHED*/ 134 } 135 } 136 137 if (sufflen + strlen(prefix) >= PATH_MAX) 138 errx(1, "name too long"); 139 140 argc -= optind; 141 argv += optind; 142 143 if ((infn = *argv++) == NULL) 144 usage(); 145 if (strcmp(infn, "-") == 0) { 146 infile = stdin; 147 infn = "stdin"; 148 } else if ((infile = fopen(infn, "r")) == NULL) 149 err(1, "%s", infn); 150 151 if (!kflag) { 152 doclean = 1; 153 (void)atexit(cleanup); 154 sa.sa_flags = 0; 155 sa.sa_handler = handlesig; 156 (void)sigemptyset(&sa.sa_mask); 157 (void)sigaddset(&sa.sa_mask, SIGHUP); 158 (void)sigaddset(&sa.sa_mask, SIGINT); 159 (void)sigaddset(&sa.sa_mask, SIGTERM); 160 (void)sigaction(SIGHUP, &sa, NULL); 161 (void)sigaction(SIGINT, &sa, NULL); 162 (void)sigaction(SIGTERM, &sa, NULL); 163 } 164 165 lineno = 0; 166 nfiles = 0; 167 truncofs = 0; 168 overfile = NULL; 169 170 /* Ensure 10^sufflen < LONG_MAX. */ 171 for (maxfiles = 1, i = 0; i < sufflen; i++) { 172 if (maxfiles > LONG_MAX / 10) 173 errx(1, "%ld: suffix too long (limit %ld)", 174 sufflen, i); 175 maxfiles *= 10; 176 } 177 178 /* Create files based on supplied patterns. */ 179 while (nfiles < maxfiles - 1 && (expr = *argv++) != NULL) { 180 /* Look ahead & see if this pattern has any repetitions. 
*/ 181 if (*argv != NULL && **argv == '{') { 182 errno = 0; 183 reps = strtol(*argv + 1, &ep, 10); 184 if (reps < 0 || *ep != '}' || errno != 0) 185 errx(1, "%s: bad repetition count", *argv + 1); 186 argv++; 187 } else 188 reps = 0; 189 190 if (*expr == '/' || *expr == '%') { 191 do 192 do_rexp(expr); 193 while (reps-- != 0 && nfiles < maxfiles - 1); 194 } else if (isdigit((unsigned char)*expr)) 195 do_lineno(expr); 196 else 197 errx(1, "%s: unrecognised pattern", expr); 198 } 199 200 /* Copy the rest into a new file. */ 201 if (!feof(infile)) { 202 ofp = newfile(); 203 while ((p = getline()) != NULL && fputs(p, ofp) == 0) 204 ; 205 if (!sflag) 206 (void)printf("%jd\n", (intmax_t)ftello(ofp)); 207 if (fclose(ofp) != 0) 208 err(1, "%s", currfile); 209 } 210 211 toomuch(NULL, 0L); 212 doclean = 0; 213 214 return (0); 215} 216 217static void 218usage(void) 219{ 220 221 (void)fprintf(stderr, 222"Usage: %s [-ks] [-f prefix] [-n number] file args ...\n", getprogname()); 223 exit(1); 224} 225 226static void 227handlesig(int sig) 228{ 229 char msg[BUFSIZ]; 230 size_t len; 231 struct sigaction n_hand; 232 sigset_t n_mask; 233 234 len = snprintf(msg, sizeof(msg), "%s: Caught %s, cleaning up\n", 235 getprogname(), strsignal(sig)); 236 if (len < sizeof(msg)) 237 (void)write(STDERR_FILENO, msg, len); 238 cleanup(); 239 /* Reset to default signal handler, clear mask, raise signal */ 240 memset(&n_hand, 0, sizeof n_hand); 241 sigemptyset(&n_hand.sa_mask); 242 n_hand.sa_handler = SIG_DFL; 243 if ((sigaction(sig, &n_hand, NULL) == 0) && 244 (sigemptyset(&n_mask) == 0) && 245 (sigaddset(&n_mask, sig) == 0) && 246 (sigprocmask(SIG_UNBLOCK, &n_mask, 0) == 0)) { 247 raise(sig); 248 } 249 _exit(2); 250} 251 252/* Create a new output file. 
*/ 253static FILE * 254newfile(void) 255{ 256 FILE *fp; 257 258 if ((size_t)snprintf(currfile, sizeof(currfile), "%s%0*ld", prefix, 259 (int)sufflen, nfiles) >= sizeof(currfile)) 260 errx(1, "%s: %s", currfile, strerror(ENAMETOOLONG)); 261 if ((fp = fopen(currfile, "w+")) == NULL) 262 err(1, "%s", currfile); 263 nfiles++; 264 265 return (fp); 266} 267 268/* Remove partial output, called before exiting. */ 269static void 270cleanup(void) 271{ 272 char fnbuf[PATH_MAX]; 273 long i; 274 275 if (!doclean) 276 return; 277 278 /* 279 * NOTE: One cannot portably assume to be able to call snprintf() 280 * from inside a signal handler. It does, however, appear to be safe 281 * to do on FreeBSD and NetBSD. The solution to this problem is worse 282 * than the problem itself. 283 */ 284 285 for (i = 0; i < nfiles; i++) { 286 (void)snprintf(fnbuf, sizeof(fnbuf), "%s%0*ld", prefix, 287 (int)sufflen, i); 288 (void)unlink(fnbuf); 289 } 290} 291 292/* Read a line from the input into a static buffer. */ 293static char * 294getline(void) 295{ 296 static char lbuf[LINE_MAX]; 297 FILE *src; 298 299 src = overfile != NULL ? overfile : infile; 300 301again: if (fgets(lbuf, sizeof(lbuf), src) == NULL) { 302 if (src == overfile) { 303 src = infile; 304 goto again; 305 } 306 return (NULL); 307 } 308 if (ferror(src)) 309 err(1, "%s", infn); 310 lineno++; 311 312 return (lbuf); 313} 314 315/* Conceptually rewind the input (as obtained by getline()) back `n' lines. */ 316static void 317toomuch(FILE *ofp, long n) 318{ 319 char buf[BUFSIZ]; 320 size_t i, nread; 321 322 if (overfile != NULL) { 323 /* 324 * Truncate the previous file we overflowed into back to 325 * the correct length, close it. 
326 */ 327 if (fflush(overfile) != 0) 328 err(1, "overflow"); 329 if (ftruncate(fileno(overfile), truncofs) != 0) 330 err(1, "overflow"); 331 if (fclose(overfile) != 0) 332 err(1, "overflow"); 333 overfile = NULL; 334 } 335 336 if (n == 0) 337 /* Just tidying up */ 338 return; 339 340 lineno -= n; 341 342 /* 343 * Wind the overflow file backwards to `n' lines before the 344 * current one. 345 */ 346 do { 347 if (ftello(ofp) < (off_t)sizeof(buf)) 348 rewind(ofp); 349 else 350 (void)fseeko(ofp, -(off_t)sizeof(buf), SEEK_CUR); 351 if (ferror(ofp)) 352 errx(1, "%s: can't seek", currfile); 353 if ((nread = fread(buf, 1, sizeof(buf), ofp)) == 0) 354 errx(1, "can't read overflowed output"); 355 if (fseeko(ofp, -(off_t)nread, SEEK_CUR) != 0) 356 err(1, "%s", currfile); 357 for (i = 1; i <= nread; i++) 358 if (buf[nread - i] == '\n' && n-- == 0) 359 break; 360 if (ftello(ofp) == 0) 361 break; 362 } while (n > 0); 363 if (fseeko(ofp, (off_t)nread - i + 1, SEEK_CUR) != 0) 364 err(1, "%s", currfile); 365 366 /* 367 * getline() will read from here. Next call will truncate to 368 * truncofs in this file. 369 */ 370 overfile = ofp; 371 truncofs = ftello(overfile); 372} 373 374/* Handle splits for /regexp/ and %regexp% patterns. */ 375static void 376do_rexp(const char *expr) 377{ 378 regex_t cre; 379 intmax_t nwritten; 380 long ofs; 381 int first; 382 char *ecopy, *ep, *p, *pofs, *re; 383 FILE *ofp; 384 385 if ((ecopy = strdup(expr)) == NULL) 386 err(1, "strdup"); 387 388 re = ecopy + 1; 389 if ((pofs = strrchr(ecopy, *expr)) == NULL || pofs[-1] == '\\') 390 errx(1, "%s: missing trailing %c", expr, *expr); 391 *pofs++ = '\0'; 392 393 if (*pofs != '\0') { 394 errno = 0; 395 ofs = strtol(pofs, &ep, 10); 396 if (*ep != '\0' || errno != 0) 397 errx(1, "%s: bad offset", pofs); 398 } else 399 ofs = 0; 400 401 if (regcomp(&cre, re, REG_BASIC|REG_NOSUB) != 0) 402 errx(1, "%s: bad regular expression", re); 403 404 if (*expr == '/') 405 /* /regexp/: Save results to a file. 
*/ 406 ofp = newfile(); 407 else { 408 /* %regexp%: Make a temporary file for overflow. */ 409 if ((ofp = tmpfile()) == NULL) 410 err(1, "tmpfile"); 411 } 412 413 /* Read and output lines until we get a match. */ 414 first = 1; 415 while ((p = getline()) != NULL) { 416 if (fputs(p, ofp) != 0) 417 break; 418 if (!first && regexec(&cre, p, 0, NULL, 0) == 0) 419 break; 420 first = 0; 421 } 422 423 if (p == NULL) 424 errx(1, "%s: no match", re); 425 426 if (ofs <= 0) { 427 /* 428 * Negative (or zero) offset: throw back any lines we should 429 * not have read yet. 430 */ 431 if (p != NULL) { 432 toomuch(ofp, -ofs + 1); 433 nwritten = (intmax_t)truncofs; 434 } else 435 nwritten = (intmax_t)ftello(ofp); 436 } else { 437 /* 438 * Positive offset: copy the requested number of lines 439 * after the match. 440 */ 441 while (--ofs > 0 && (p = getline()) != NULL) 442 if (fputs(p, ofp) != 0) 443 break; 444 toomuch(NULL, 0L); 445 nwritten = (intmax_t)ftello(ofp); 446 if (fclose(ofp) != 0) 447 err(1, "%s", currfile); 448 } 449 450 if (!sflag && *expr == '/') 451 (void)printf("%jd\n", nwritten); 452 453 regfree(&cre); 454 free(ecopy); 455} 456 457/* Handle splits based on line number. */ 458static void 459do_lineno(const char *expr) 460{ 461 long lastline, tgtline; 462 char *ep, *p; 463 FILE *ofp; 464 465 errno = 0; 466 tgtline = strtol(expr, &ep, 10); 467 if (tgtline <= 0 || errno != 0 || *ep != '\0') 468 errx(1, "%s: bad line number", expr); 469 lastline = tgtline; 470 if (lastline <= lineno) 471 errx(1, "%s: can't go backwards", expr); 472 473 while (nfiles < maxfiles - 1) { 474 ofp = newfile(); 475 while (lineno + 1 != lastline) { 476 if ((p = getline()) == NULL) 477 errx(1, "%ld: out of range", lastline); 478 if (fputs(p, ofp) != 0) 479 break; 480 } 481 if (!sflag) 482 (void)printf("%jd\n", (intmax_t)ftello(ofp)); 483 if (fclose(ofp) != 0) 484 err(1, "%s", currfile); 485 if (reps-- == 0) 486 break; 487 lastline += tgtline; 488 } 489} 490