/* cut.c revision 98035 */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Adam S. Moskowitz of Menlo Consulting and Marciano Pitargue.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef lint
/* Identification strings; compiled out when `lint' is defined. */
static const char copyright[] =
"@(#) Copyright (c) 1989, 1993\n\
	The Regents of the University of California.  All rights reserved.\n";
static const char sccsid[] = "@(#)cut.c	8.3 (Berkeley) 5/4/95";
static const char rcsid[] =
  "$FreeBSD: head/usr.bin/cut/cut.c 98035 2002-06-08 07:27:21Z tjr $";
#endif /* not lint */

#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Command-line state; main() fills these in while parsing the options. */
int	bflag;		/* -b given: select by byte position */
int	cflag;		/* -c given: select by character position */
char	dchar;		/* field delimiter used with -f (tab unless -d) */
int	dflag;		/* -d given */
int	fflag;		/* -f given: select by field */
int	nflag;		/* -n given: with -b, keep multibyte characters whole */
int	sflag;		/* -s given: with -f, drop lines with no delimiter */

/* Line cutters; main() picks one of them according to -b/-c/-f/-n. */
void	b_n_cut(FILE *, const char *);
void	c_cut(FILE *, const char *);
void	f_cut(FILE *, const char *);
/* List parsing and maintenance of the selection table. */
void	get_list(char *);
int	main(int, char **);
void	needpos(size_t);
static 	void usage(void);

711590Srgrimesint
721590Srgrimesmain(argc, argv)
731590Srgrimes	int argc;
741590Srgrimes	char *argv[];
751590Srgrimes{
761590Srgrimes	FILE *fp;
7798012Stjr	void (*fcn)(FILE *, const char *);
7897218Stjr	int ch, rval;
791590Srgrimes
8098012Stjr	setlocale(LC_ALL, "");
8198012Stjr
8243533Seivind	fcn = NULL;
831590Srgrimes	dchar = '\t';			/* default delimiter is \t */
841590Srgrimes
8598035Stjr	/*
8698035Stjr	 * Since we don't support multi-byte characters, the -c and -b
8798035Stjr	 * options are equivalent.
8898035Stjr	 */
8943532Seivind	while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1)
901590Srgrimes		switch(ch) {
9143531Seivind		case 'b':
9298035Stjr			fcn = c_cut;
9398035Stjr			get_list(optarg);
9498035Stjr			bflag = 1;
9598035Stjr			break;
961590Srgrimes		case 'c':
971590Srgrimes			fcn = c_cut;
981590Srgrimes			get_list(optarg);
991590Srgrimes			cflag = 1;
1001590Srgrimes			break;
1011590Srgrimes		case 'd':
1021590Srgrimes			dchar = *optarg;
1031590Srgrimes			dflag = 1;
1041590Srgrimes			break;
1051590Srgrimes		case 'f':
1061590Srgrimes			get_list(optarg);
1071590Srgrimes			fcn = f_cut;
1081590Srgrimes			fflag = 1;
1091590Srgrimes			break;
1101590Srgrimes		case 's':
1111590Srgrimes			sflag = 1;
1121590Srgrimes			break;
11343531Seivind		case 'n':
11498035Stjr			nflag = 1;
11543531Seivind			break;
1161590Srgrimes		case '?':
1171590Srgrimes		default:
1181590Srgrimes			usage();
1191590Srgrimes		}
1201590Srgrimes	argc -= optind;
1211590Srgrimes	argv += optind;
1221590Srgrimes
1231590Srgrimes	if (fflag) {
12498035Stjr		if (bflag || cflag || nflag)
1251590Srgrimes			usage();
12698035Stjr	} else if (!(bflag || cflag) || dflag || sflag)
1271590Srgrimes		usage();
12898035Stjr	else if (!bflag && nflag)
12998035Stjr		usage();
1301590Srgrimes
13198035Stjr	if (nflag)
13298035Stjr		fcn = b_n_cut;
13398035Stjr
13497218Stjr	rval = 0;
1351590Srgrimes	if (*argv)
1361590Srgrimes		for (; *argv; ++argv) {
13797237Stjr			if (strcmp(*argv, "-") == 0)
13897237Stjr				fcn(stdin, "stdin");
13997237Stjr			else {
14097237Stjr				if (!(fp = fopen(*argv, "r"))) {
14197237Stjr					warn("%s", *argv);
14297237Stjr					rval = 1;
14397237Stjr					continue;
14497237Stjr				}
14597237Stjr				fcn(fp, *argv);
14697237Stjr				(void)fclose(fp);
14797218Stjr			}
1481590Srgrimes		}
1491590Srgrimes	else
1501590Srgrimes		fcn(stdin, "stdin");
15197218Stjr	exit(rval);
1521590Srgrimes}
1531590Srgrimes
/*
 * Selection state built by get_list():
 *   autostart - largest N given as a leading "-N" item, 0 if none;
 *   autostop  - nonzero when an item ended in '-'; the cutters then keep
 *               everything that follows the first maxval positions;
 *   maxval    - number of leading positions the cutters look up in
 *               positions[].
 */
size_t autostart, autostop, maxval;

/* 1-based selection table; a nonzero byte marks a selected position. */
char *positions;

1581590Srgrimesvoid
1591590Srgrimesget_list(list)
1601590Srgrimes	char *list;
1611590Srgrimes{
16271725Swill	size_t setautostart, start, stop;
16343533Seivind	char *pos;
1641590Srgrimes	char *p;
1651590Srgrimes
1661590Srgrimes	/*
1671590Srgrimes	 * set a byte in the positions array to indicate if a field or
1681590Srgrimes	 * column is to be selected; use +1, it's 1-based, not 0-based.
1691590Srgrimes	 * This parser is less restrictive than the Draft 9 POSIX spec.
1701590Srgrimes	 * POSIX doesn't allow lists that aren't in increasing order or
1711590Srgrimes	 * overlapping lists.  We also handle "-3-5" although there's no
1721590Srgrimes	 * real reason too.
1731590Srgrimes	 */
17443533Seivind	for (; (p = strsep(&list, ", \t")) != NULL;) {
1751590Srgrimes		setautostart = start = stop = 0;
1761590Srgrimes		if (*p == '-') {
1771590Srgrimes			++p;
1781590Srgrimes			setautostart = 1;
1791590Srgrimes		}
18043533Seivind		if (isdigit((unsigned char)*p)) {
1811590Srgrimes			start = stop = strtol(p, &p, 10);
1821590Srgrimes			if (setautostart && start > autostart)
1831590Srgrimes				autostart = start;
1841590Srgrimes		}
1851590Srgrimes		if (*p == '-') {
18643533Seivind			if (isdigit((unsigned char)p[1]))
1871590Srgrimes				stop = strtol(p + 1, &p, 10);
1881590Srgrimes			if (*p == '-') {
1891590Srgrimes				++p;
1901590Srgrimes				if (!autostop || autostop > stop)
1911590Srgrimes					autostop = stop;
1921590Srgrimes			}
1931590Srgrimes		}
1941590Srgrimes		if (*p)
19527098Scharnier			errx(1, "[-cf] list: illegal list value");
1961590Srgrimes		if (!stop || !start)
19727098Scharnier			errx(1, "[-cf] list: values may not include zero");
19897234Stjr		if (maxval < stop) {
1991590Srgrimes			maxval = stop;
20097234Stjr			needpos(maxval + 1);
20197234Stjr		}
2021590Srgrimes		for (pos = positions + start; start++ <= stop; *pos++ = 1);
2031590Srgrimes	}
2041590Srgrimes
2051590Srgrimes	/* overlapping ranges */
20697234Stjr	if (autostop && maxval > autostop) {
2071590Srgrimes		maxval = autostop;
20897234Stjr		needpos(maxval + 1);
20997234Stjr	}
2101590Srgrimes
2111590Srgrimes	/* set autostart */
2121590Srgrimes	if (autostart)
2131590Srgrimes		memset(positions + 1, '1', autostart);
2141590Srgrimes}
2151590Srgrimes
21697234Stjrvoid
21797234Stjrneedpos(size_t n)
21897234Stjr{
21997234Stjr	static size_t npos;
22097328Stjr	size_t oldnpos;
22197234Stjr
22297234Stjr	/* Grow the positions array to at least the specified size. */
22397234Stjr	if (n > npos) {
22497328Stjr		oldnpos = npos;
22597234Stjr		if (npos == 0)
22697234Stjr			npos = n;
22797234Stjr		while (n > npos)
22897234Stjr			npos *= 2;
22997234Stjr		if ((positions = realloc(positions, npos)) == NULL)
23097234Stjr			err(1, "realloc");
23197328Stjr		memset((char *)positions + oldnpos, 0, npos - oldnpos);
23297234Stjr	}
23397234Stjr}
23497234Stjr
23598035Stjr/*
23698035Stjr * Cut based on byte positions, taking care not to split multibyte characters.
23798035Stjr * Although this function also handles the case where -n is not specified,
23898035Stjr * c_cut() ought to be much faster.
23998035Stjr */
2401590Srgrimesvoid
24198035Stjrb_n_cut(fp, fname)
24298035Stjr	FILE *fp;
24398035Stjr	const char *fname;
24498035Stjr{
24598035Stjr	size_t col, i, lbuflen;
24698035Stjr	char *lbuf;
24798035Stjr	int canwrite, clen, warned;
24898035Stjr
24998035Stjr	warned = 0;
25098035Stjr	while ((lbuf = fgetln(fp, &lbuflen)) != NULL) {
25198035Stjr		for (col = 0; lbuflen > 0; col += clen) {
25298035Stjr			if ((clen = mblen(lbuf, lbuflen)) < 0) {
25398035Stjr				if (!warned) {
25498035Stjr					warn("%s", fname);
25598035Stjr					warned = 1;
25698035Stjr				}
25798035Stjr				clen = 1;
25898035Stjr			}
25998035Stjr			if (clen == 0 || *lbuf == '\n')
26098035Stjr				break;
26198035Stjr			if (col < maxval && !positions[1 + col]) {
26298035Stjr				/*
26398035Stjr				 * Print the character if (1) after an initial
26498035Stjr				 * segment of un-selected bytes, the rest of
26598035Stjr				 * it is selected, and (2) the last byte is
26698035Stjr				 * selected.
26798035Stjr				 */
26898035Stjr				i = col;
26998035Stjr				while (i < col + clen && i < maxval &&
27098035Stjr				    !positions[1 + i])
27198035Stjr					i++;
27298035Stjr				canwrite = i < col + clen;
27398035Stjr				for (; i < col + clen && i < maxval; i++)
27498035Stjr					canwrite &= positions[1 + i];
27598035Stjr				if (canwrite)
27698035Stjr					fwrite(lbuf, 1, clen, stdout);
27798035Stjr			} else {
27898035Stjr				/*
27998035Stjr				 * Print the character if all of it has
28098035Stjr				 * been selected.
28198035Stjr				 */
28298035Stjr				canwrite = 1;
28398035Stjr				for (i = col; i < col + clen; i++)
28498035Stjr					if ((i >= maxval && !autostop) ||
28598035Stjr					    (i < maxval && !positions[1 + i])) {
28698035Stjr						canwrite = 0;
28798035Stjr						break;
28898035Stjr					}
28998035Stjr				if (canwrite)
29098035Stjr					fwrite(lbuf, 1, clen, stdout);
29198035Stjr			}
29298035Stjr			lbuf += clen;
29398035Stjr			lbuflen -= clen;
29498035Stjr		}
29598035Stjr		if (lbuflen > 0)
29698035Stjr			putchar('\n');
29798035Stjr	}
29898035Stjr}
29998035Stjr
30098035Stjrvoid
3011590Srgrimesc_cut(fp, fname)
3021590Srgrimes	FILE *fp;
30398012Stjr	const char *fname __unused;
3041590Srgrimes{
30543533Seivind	int ch, col;
30643533Seivind	char *pos;
3071590Srgrimes
30843533Seivind	ch = 0;
3091590Srgrimes	for (;;) {
3101590Srgrimes		pos = positions + 1;
3111590Srgrimes		for (col = maxval; col; --col) {
3121590Srgrimes			if ((ch = getc(fp)) == EOF)
3131590Srgrimes				return;
3141590Srgrimes			if (ch == '\n')
3151590Srgrimes				break;
3161590Srgrimes			if (*pos++)
3171590Srgrimes				(void)putchar(ch);
3181590Srgrimes		}
31943533Seivind		if (ch != '\n') {
3201590Srgrimes			if (autostop)
3211590Srgrimes				while ((ch = getc(fp)) != EOF && ch != '\n')
3221590Srgrimes					(void)putchar(ch);
3231590Srgrimes			else
3241590Srgrimes				while ((ch = getc(fp)) != EOF && ch != '\n');
32543533Seivind		}
3261590Srgrimes		(void)putchar('\n');
3271590Srgrimes	}
3281590Srgrimes}
3291590Srgrimes
3301590Srgrimesvoid
3311590Srgrimesf_cut(fp, fname)
3321590Srgrimes	FILE *fp;
33377852Sdd	const char *fname __unused;
3341590Srgrimes{
33543533Seivind	int ch, field, isdelim;
33643533Seivind	char *pos, *p, sep;
3371590Srgrimes	int output;
33898012Stjr	char *lbuf, *mlbuf;
33975930Sdd	size_t lbuflen;
3401590Srgrimes
34198012Stjr	mlbuf = NULL;
34275930Sdd	for (sep = dchar; (lbuf = fgetln(fp, &lbuflen)) != NULL;) {
34375930Sdd		/* Assert EOL has a newline. */
34475930Sdd		if (*(lbuf + lbuflen - 1) != '\n') {
34575930Sdd			/* Can't have > 1 line with no trailing newline. */
34675930Sdd			mlbuf = malloc(lbuflen + 1);
34775930Sdd			if (mlbuf == NULL)
34875930Sdd				err(1, "malloc");
34975930Sdd			memcpy(mlbuf, lbuf, lbuflen);
35075930Sdd			*(mlbuf + lbuflen) = '\n';
35175930Sdd			lbuf = mlbuf;
35275930Sdd		}
3537200Sache		output = 0;
3541590Srgrimes		for (isdelim = 0, p = lbuf;; ++p) {
35575930Sdd			ch = *p;
3561590Srgrimes			/* this should work if newline is delimiter */
3571590Srgrimes			if (ch == sep)
3581590Srgrimes				isdelim = 1;
3591590Srgrimes			if (ch == '\n') {
3601590Srgrimes				if (!isdelim && !sflag)
36175930Sdd					(void)fwrite(lbuf, lbuflen, 1, stdout);
3621590Srgrimes				break;
3631590Srgrimes			}
3641590Srgrimes		}
3651590Srgrimes		if (!isdelim)
3661590Srgrimes			continue;
3671590Srgrimes
3681590Srgrimes		pos = positions + 1;
3691590Srgrimes		for (field = maxval, p = lbuf; field; --field, ++pos) {
3701590Srgrimes			if (*pos) {
3711590Srgrimes				if (output++)
3721590Srgrimes					(void)putchar(sep);
3731590Srgrimes				while ((ch = *p++) != '\n' && ch != sep)
3741590Srgrimes					(void)putchar(ch);
37543533Seivind			} else {
37643533Seivind				while ((ch = *p++) != '\n' && ch != sep)
37743533Seivind					continue;
37843533Seivind			}
3791590Srgrimes			if (ch == '\n')
3801590Srgrimes				break;
3811590Srgrimes		}
38243533Seivind		if (ch != '\n') {
3831590Srgrimes			if (autostop) {
3841590Srgrimes				if (output)
3851590Srgrimes					(void)putchar(sep);
3861590Srgrimes				for (; (ch = *p) != '\n'; ++p)
3871590Srgrimes					(void)putchar(ch);
3881590Srgrimes			} else
3891590Srgrimes				for (; (ch = *p) != '\n'; ++p);
39043533Seivind		}
3911590Srgrimes		(void)putchar('\n');
3921590Srgrimes	}
39375930Sdd	if (mlbuf != NULL)
39475930Sdd		free(mlbuf);
3951590Srgrimes}
3961590Srgrimes
/* Print the three supported invocations on stderr and exit with status 1. */
static void
usage(void)
{
	(void)fprintf(stderr, "%s\n%s\n%s\n",
		"usage: cut -b list [-n] [file ...]",
		"       cut -c list [file ...]",
		"       cut -f list [-s] [-d delim] [file ...]");
	exit(1);
}