1/* split.c -- split a file into pieces. 2 Copyright (C) 1988, 1991, 1995-2010 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation, either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17/* By tege@sics.se, with rms. 18 19 To do: 20 * Implement -t CHAR or -t REGEX to specify break characters other 21 than newline. */ 22 23#include <config.h> 24 25#include <stdio.h> 26#include <getopt.h> 27#include <sys/types.h> 28 29#include "system.h" 30#include "error.h" 31#include "fd-reopen.h" 32#include "fcntl--.h" 33#include "full-read.h" 34#include "full-write.h" 35#include "quote.h" 36#include "safe-read.h" 37#include "xfreopen.h" 38#include "xstrtol.h" 39 40/* The official name of this program (e.g., no `g' prefix). */ 41#define PROGRAM_NAME "split" 42 43#define AUTHORS \ 44 proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \ 45 proper_name ("Richard M. Stallman") 46 47#define DEFAULT_SUFFIX_LENGTH 2 48 49/* Base name of output files. */ 50static char const *outbase; 51 52/* Name of output files. */ 53static char *outfile; 54 55/* Pointer to the end of the prefix in OUTFILE. 56 Suffixes are inserted here. */ 57static char *outfile_mid; 58 59/* Length of OUTFILE's suffix. */ 60static size_t suffix_length = DEFAULT_SUFFIX_LENGTH; 61 62/* Alphabet of characters to use in suffix. */ 63static char const *suffix_alphabet = "abcdefghijklmnopqrstuvwxyz"; 64 65/* Name of input file. May be "-". */ 66static char *infile; 67 68/* Descriptor on which output file is open. */ 69static int output_desc; 70 71/* If true, print a diagnostic on standard error just before each 72 output file is opened. */ 73static bool verbose; 74 75/* For long options that have no equivalent short option, use a 76 non-character as a pseudo short option, starting with CHAR_MAX + 1. */ 77enum 78{ 79 VERBOSE_OPTION = CHAR_MAX + 1 80}; 81 82static struct option const longopts[] = 83{ 84 {"bytes", required_argument, NULL, 'b'}, 85 {"lines", required_argument, NULL, 'l'}, 86 {"line-bytes", required_argument, NULL, 'C'}, 87 {"suffix-length", required_argument, NULL, 'a'}, 88 {"numeric-suffixes", no_argument, NULL, 'd'}, 89 {"verbose", no_argument, NULL, VERBOSE_OPTION}, 90 {GETOPT_HELP_OPTION_DECL}, 91 {GETOPT_VERSION_OPTION_DECL}, 92 {NULL, 0, NULL, 0} 93}; 94 95void 96usage (int status) 97{ 98 if (status != EXIT_SUCCESS) 99 fprintf (stderr, _("Try `%s --help' for more information.\n"), 100 program_name); 101 else 102 { 103 printf (_("\ 104Usage: %s [OPTION]... [INPUT [PREFIX]]\n\ 105"), 106 program_name); 107 fputs (_("\ 108Output fixed-size pieces of INPUT to PREFIXaa, PREFIXab, ...; default\n\ 109size is 1000 lines, and default PREFIX is `x'. With no INPUT, or when INPUT\n\ 110is -, read standard input.\n\ 111\n\ 112"), stdout); 113 fputs (_("\ 114Mandatory arguments to long options are mandatory for short options too.\n\ 115"), stdout); 116 fprintf (stdout, _("\ 117 -a, --suffix-length=N use suffixes of length N (default %d)\n\ 118 -b, --bytes=SIZE put SIZE bytes per output file\n\ 119 -C, --line-bytes=SIZE put at most SIZE bytes of lines per output file\n\ 120 -d, --numeric-suffixes use numeric suffixes instead of alphabetic\n\ 121 -l, --lines=NUMBER put NUMBER lines per output file\n\ 122"), DEFAULT_SUFFIX_LENGTH); 123 fputs (_("\ 124 --verbose print a diagnostic just before each\n\ 125 output file is opened\n\ 126"), stdout); 127 fputs (HELP_OPTION_DESCRIPTION, stdout); 128 fputs (VERSION_OPTION_DESCRIPTION, stdout); 129 emit_size_note (); 130 emit_ancillary_info (); 131 } 132 exit (status); 133} 134 135/* Compute the next sequential output file name and store it into the 136 string `outfile'. */ 137 138static void 139next_file_name (void) 140{ 141 /* Index in suffix_alphabet of each character in the suffix. */ 142 static size_t *sufindex; 143 144 if (! outfile) 145 { 146 /* Allocate and initialize the first file name. */ 147 148 size_t outbase_length = strlen (outbase); 149 size_t outfile_length = outbase_length + suffix_length; 150 if (outfile_length + 1 < outbase_length) 151 xalloc_die (); 152 outfile = xmalloc (outfile_length + 1); 153 outfile_mid = outfile + outbase_length; 154 memcpy (outfile, outbase, outbase_length); 155 memset (outfile_mid, suffix_alphabet[0], suffix_length); 156 outfile[outfile_length] = 0; 157 sufindex = xcalloc (suffix_length, sizeof *sufindex); 158 159#if ! _POSIX_NO_TRUNC && HAVE_PATHCONF && defined _PC_NAME_MAX 160 /* POSIX requires that if the output file name is too long for 161 its directory, `split' must fail without creating any files. 162 This must be checked for explicitly on operating systems that 163 silently truncate file names. */ 164 { 165 char *dir = dir_name (outfile); 166 long name_max = pathconf (dir, _PC_NAME_MAX); 167 if (0 <= name_max && name_max < base_len (last_component (outfile))) 168 error (EXIT_FAILURE, ENAMETOOLONG, "%s", outfile); 169 free (dir); 170 } 171#endif 172 } 173 else 174 { 175 /* Increment the suffix in place, if possible. */ 176 177 size_t i = suffix_length; 178 while (i-- != 0) 179 { 180 sufindex[i]++; 181 outfile_mid[i] = suffix_alphabet[sufindex[i]]; 182 if (outfile_mid[i]) 183 return; 184 sufindex[i] = 0; 185 outfile_mid[i] = suffix_alphabet[sufindex[i]]; 186 } 187 error (EXIT_FAILURE, 0, _("output file suffixes exhausted")); 188 } 189} 190 191/* Write BYTES bytes at BP to an output file. 192 If NEW_FILE_FLAG is true, open the next output file. 193 Otherwise add to the same output file already in use. */ 194 195static void 196cwrite (bool new_file_flag, const char *bp, size_t bytes) 197{ 198 if (new_file_flag) 199 { 200 if (output_desc >= 0 && close (output_desc) < 0) 201 error (EXIT_FAILURE, errno, "%s", outfile); 202 203 next_file_name (); 204 if (verbose) 205 fprintf (stdout, _("creating file %s\n"), quote (outfile)); 206 output_desc = open (outfile, 207 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 208 (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP 209 | S_IROTH | S_IWOTH)); 210 if (output_desc < 0) 211 error (EXIT_FAILURE, errno, "%s", outfile); 212 } 213 if (full_write (output_desc, bp, bytes) != bytes) 214 error (EXIT_FAILURE, errno, "%s", outfile); 215} 216 217/* Split into pieces of exactly N_BYTES bytes. 218 Use buffer BUF, whose size is BUFSIZE. */ 219 220static void 221bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize) 222{ 223 size_t n_read; 224 bool new_file_flag = true; 225 size_t to_read; 226 uintmax_t to_write = n_bytes; 227 char *bp_out; 228 229 do 230 { 231 n_read = full_read (STDIN_FILENO, buf, bufsize); 232 if (n_read == SAFE_READ_ERROR) 233 error (EXIT_FAILURE, errno, "%s", infile); 234 bp_out = buf; 235 to_read = n_read; 236 for (;;) 237 { 238 if (to_read < to_write) 239 { 240 if (to_read) /* do not write 0 bytes! */ 241 { 242 cwrite (new_file_flag, bp_out, to_read); 243 to_write -= to_read; 244 new_file_flag = false; 245 } 246 break; 247 } 248 else 249 { 250 size_t w = to_write; 251 cwrite (new_file_flag, bp_out, w); 252 bp_out += w; 253 to_read -= w; 254 new_file_flag = true; 255 to_write = n_bytes; 256 } 257 } 258 } 259 while (n_read == bufsize); 260} 261 262/* Split into pieces of exactly N_LINES lines. 263 Use buffer BUF, whose size is BUFSIZE. */ 264 265static void 266lines_split (uintmax_t n_lines, char *buf, size_t bufsize) 267{ 268 size_t n_read; 269 char *bp, *bp_out, *eob; 270 bool new_file_flag = true; 271 uintmax_t n = 0; 272 273 do 274 { 275 n_read = full_read (STDIN_FILENO, buf, bufsize); 276 if (n_read == SAFE_READ_ERROR) 277 error (EXIT_FAILURE, errno, "%s", infile); 278 bp = bp_out = buf; 279 eob = bp + n_read; 280 *eob = '\n'; 281 for (;;) 282 { 283 bp = memchr (bp, '\n', eob - bp + 1); 284 if (bp == eob) 285 { 286 if (eob != bp_out) /* do not write 0 bytes! */ 287 { 288 size_t len = eob - bp_out; 289 cwrite (new_file_flag, bp_out, len); 290 new_file_flag = false; 291 } 292 break; 293 } 294 295 ++bp; 296 if (++n >= n_lines) 297 { 298 cwrite (new_file_flag, bp_out, bp - bp_out); 299 bp_out = bp; 300 new_file_flag = true; 301 n = 0; 302 } 303 } 304 } 305 while (n_read == bufsize); 306} 307 308/* Split into pieces that are as large as possible while still not more 309 than N_BYTES bytes, and are split on line boundaries except 310 where lines longer than N_BYTES bytes occur. 311 FIXME: Allow N_BYTES to be any uintmax_t value, and don't require a 312 buffer of size N_BYTES, in case N_BYTES is very large. */ 313 314static void 315line_bytes_split (size_t n_bytes) 316{ 317 size_t n_read; 318 char *bp; 319 bool eof = false; 320 size_t n_buffered = 0; 321 char *buf = xmalloc (n_bytes); 322 323 do 324 { 325 /* Fill up the full buffer size from the input file. */ 326 327 n_read = full_read (STDIN_FILENO, buf + n_buffered, n_bytes - n_buffered); 328 if (n_read == SAFE_READ_ERROR) 329 error (EXIT_FAILURE, errno, "%s", infile); 330 331 n_buffered += n_read; 332 if (n_buffered != n_bytes) 333 { 334 if (n_buffered == 0) 335 break; 336 eof = true; 337 } 338 339 /* Find where to end this chunk. */ 340 bp = buf + n_buffered; 341 if (n_buffered == n_bytes) 342 { 343 while (bp > buf && bp[-1] != '\n') 344 bp--; 345 } 346 347 /* If chunk has no newlines, use all the chunk. */ 348 if (bp == buf) 349 bp = buf + n_buffered; 350 351 /* Output the chars as one output file. */ 352 cwrite (true, buf, bp - buf); 353 354 /* Discard the chars we just output; move rest of chunk 355 down to be the start of the next chunk. Source and 356 destination probably overlap. */ 357 n_buffered -= bp - buf; 358 if (n_buffered > 0) 359 memmove (buf, bp, n_buffered); 360 } 361 while (!eof); 362 free (buf); 363} 364 365#define FAIL_ONLY_ONE_WAY() \ 366 do \ 367 { \ 368 error (0, 0, _("cannot split in more than one way")); \ 369 usage (EXIT_FAILURE); \ 370 } \ 371 while (0) 372 373int 374main (int argc, char **argv) 375{ 376 struct stat stat_buf; 377 enum 378 { 379 type_undef, type_bytes, type_byteslines, type_lines, type_digits 380 } split_type = type_undef; 381 size_t in_blk_size; /* optimal block size of input file device */ 382 char *buf; /* file i/o buffer */ 383 size_t page_size = getpagesize (); 384 uintmax_t n_units; 385 static char const multipliers[] = "bEGKkMmPTYZ0"; 386 int c; 387 int digits_optind = 0; 388 389 initialize_main (&argc, &argv); 390 set_program_name (argv[0]); 391 setlocale (LC_ALL, ""); 392 bindtextdomain (PACKAGE, LOCALEDIR); 393 textdomain (PACKAGE); 394 395 atexit (close_stdout); 396 397 /* Parse command line options. */ 398 399 infile = bad_cast ( "-"); 400 outbase = bad_cast ("x"); 401 402 while (1) 403 { 404 /* This is the argv-index of the option we will read next. */ 405 int this_optind = optind ? optind : 1; 406 407 c = getopt_long (argc, argv, "0123456789C:a:b:dl:", longopts, NULL); 408 if (c == -1) 409 break; 410 411 switch (c) 412 { 413 case 'a': 414 { 415 unsigned long tmp; 416 if (xstrtoul (optarg, NULL, 10, &tmp, "") != LONGINT_OK 417 || SIZE_MAX / sizeof (size_t) < tmp) 418 { 419 error (0, 0, _("%s: invalid suffix length"), optarg); 420 usage (EXIT_FAILURE); 421 } 422 suffix_length = tmp; 423 } 424 break; 425 426 case 'b': 427 if (split_type != type_undef) 428 FAIL_ONLY_ONE_WAY (); 429 split_type = type_bytes; 430 if (xstrtoumax (optarg, NULL, 10, &n_units, multipliers) != LONGINT_OK 431 || n_units == 0) 432 { 433 error (0, 0, _("%s: invalid number of bytes"), optarg); 434 usage (EXIT_FAILURE); 435 } 436 break; 437 438 case 'l': 439 if (split_type != type_undef) 440 FAIL_ONLY_ONE_WAY (); 441 split_type = type_lines; 442 if (xstrtoumax (optarg, NULL, 10, &n_units, "") != LONGINT_OK 443 || n_units == 0) 444 { 445 error (0, 0, _("%s: invalid number of lines"), optarg); 446 usage (EXIT_FAILURE); 447 } 448 break; 449 450 case 'C': 451 if (split_type != type_undef) 452 FAIL_ONLY_ONE_WAY (); 453 split_type = type_byteslines; 454 if (xstrtoumax (optarg, NULL, 10, &n_units, multipliers) != LONGINT_OK 455 || n_units == 0 || SIZE_MAX < n_units) 456 { 457 error (0, 0, _("%s: invalid number of bytes"), optarg); 458 usage (EXIT_FAILURE); 459 } 460 break; 461 462 case '0': 463 case '1': 464 case '2': 465 case '3': 466 case '4': 467 case '5': 468 case '6': 469 case '7': 470 case '8': 471 case '9': 472 if (split_type == type_undef) 473 { 474 split_type = type_digits; 475 n_units = 0; 476 } 477 if (split_type != type_undef && split_type != type_digits) 478 FAIL_ONLY_ONE_WAY (); 479 if (digits_optind != 0 && digits_optind != this_optind) 480 n_units = 0; /* More than one number given; ignore other. */ 481 digits_optind = this_optind; 482 if (!DECIMAL_DIGIT_ACCUMULATE (n_units, c - '0', uintmax_t)) 483 { 484 char buffer[INT_BUFSIZE_BOUND (uintmax_t)]; 485 error (EXIT_FAILURE, 0, 486 _("line count option -%s%c... is too large"), 487 umaxtostr (n_units, buffer), c); 488 } 489 break; 490 491 case 'd': 492 suffix_alphabet = "0123456789"; 493 break; 494 495 case VERBOSE_OPTION: 496 verbose = true; 497 break; 498 499 case_GETOPT_HELP_CHAR; 500 501 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); 502 503 default: 504 usage (EXIT_FAILURE); 505 } 506 } 507 508 /* Handle default case. */ 509 if (split_type == type_undef) 510 { 511 split_type = type_lines; 512 n_units = 1000; 513 } 514 515 if (n_units == 0) 516 { 517 error (0, 0, _("invalid number of lines: 0")); 518 usage (EXIT_FAILURE); 519 } 520 521 /* Get out the filename arguments. */ 522 523 if (optind < argc) 524 infile = argv[optind++]; 525 526 if (optind < argc) 527 outbase = argv[optind++]; 528 529 if (optind < argc) 530 { 531 error (0, 0, _("extra operand %s"), quote (argv[optind])); 532 usage (EXIT_FAILURE); 533 } 534 535 /* Open the input file. */ 536 if (! STREQ (infile, "-") 537 && fd_reopen (STDIN_FILENO, infile, O_RDONLY, 0) < 0) 538 error (EXIT_FAILURE, errno, _("cannot open %s for reading"), 539 quote (infile)); 540 541 /* Binary I/O is safer when bytecounts are used. */ 542 if (O_BINARY && ! isatty (STDIN_FILENO)) 543 xfreopen (NULL, "rb", stdin); 544 545 /* No output file is open now. */ 546 output_desc = -1; 547 548 /* Get the optimal block size of input device and make a buffer. */ 549 550 if (fstat (STDIN_FILENO, &stat_buf) != 0) 551 error (EXIT_FAILURE, errno, "%s", infile); 552 in_blk_size = io_blksize (stat_buf); 553 554 buf = ptr_align (xmalloc (in_blk_size + 1 + page_size - 1), page_size); 555 556 switch (split_type) 557 { 558 case type_digits: 559 case type_lines: 560 lines_split (n_units, buf, in_blk_size); 561 break; 562 563 case type_bytes: 564 bytes_split (n_units, buf, in_blk_size); 565 break; 566 567 case type_byteslines: 568 line_bytes_split (n_units); 569 break; 570 571 default: 572 abort (); 573 } 574 575 if (close (STDIN_FILENO) != 0) 576 error (EXIT_FAILURE, errno, "%s", infile); 577 if (output_desc >= 0 && close (output_desc) < 0) 578 error (EXIT_FAILURE, errno, "%s", outfile); 579 580 exit (EXIT_SUCCESS); 581} 582