11590Srgrimes/*
21590Srgrimes * Copyright (c) 1989, 1993
31590Srgrimes *	The Regents of the University of California.  All rights reserved.
41590Srgrimes *
51590Srgrimes * This code is derived from software contributed to Berkeley by
61590Srgrimes * Adam S. Moskowitz of Menlo Consulting and Marciano Pitargue.
71590Srgrimes *
81590Srgrimes * Redistribution and use in source and binary forms, with or without
91590Srgrimes * modification, are permitted provided that the following conditions
101590Srgrimes * are met:
111590Srgrimes * 1. Redistributions of source code must retain the above copyright
121590Srgrimes *    notice, this list of conditions and the following disclaimer.
131590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
141590Srgrimes *    notice, this list of conditions and the following disclaimer in the
151590Srgrimes *    documentation and/or other materials provided with the distribution.
161590Srgrimes * 4. Neither the name of the University nor the names of its contributors
171590Srgrimes *    may be used to endorse or promote products derived from this software
181590Srgrimes *    without specific prior written permission.
191590Srgrimes *
201590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
211590Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
221590Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
231590Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
241590Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
251590Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
261590Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
271590Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
281590Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
291590Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
301590Srgrimes * SUCH DAMAGE.
311590Srgrimes */
321590Srgrimes
331590Srgrimes#ifndef lint
3441568Sarchiestatic const char copyright[] =
351590Srgrimes"@(#) Copyright (c) 1989, 1993\n\
361590Srgrimes	The Regents of the University of California.  All rights reserved.\n";
3741568Sarchiestatic const char sccsid[] = "@(#)cut.c	8.3 (Berkeley) 5/4/95";
381590Srgrimes#endif /* not lint */
3999115Sobrien#include <sys/cdefs.h>
4099115Sobrien__FBSDID("$FreeBSD$");
411590Srgrimes
421590Srgrimes#include <ctype.h>
4327098Scharnier#include <err.h>
44131197Stjr#include <errno.h>
451590Srgrimes#include <limits.h>
4643531Seivind#include <locale.h>
471590Srgrimes#include <stdio.h>
481590Srgrimes#include <stdlib.h>
491590Srgrimes#include <string.h>
5023693Speter#include <unistd.h>
51131183Stjr#include <wchar.h>
521590Srgrimes
/* Option flags; each one is set to 1 by main() when its option is seen. */
static int	bflag, cflag, dflag, fflag, nflag, sflag, wflag;

/*
 * Field delimiter used by -f: dchar is the wide character compared against
 * input, dcharmb is the multibyte string written between output fields.
 */
static wchar_t	dchar;
static char	dcharmb[MB_LEN_MAX + 1];

/*
 * Selection state built by get_list(): positions[] is a 1-based byte map of
 * selected columns/fields, maxval bounds the indices that are consulted,
 * autostart records the largest "-N" value and autostop the smallest "N-".
 */
static char	*positions;
static size_t	autostart, autostop, maxval;

/* Cutting routines; each returns 0 on success and 1 on error. */
static int	b_cut(FILE *fp, const char *fname);
static int	b_n_cut(FILE *fp, const char *fname);
static int	c_cut(FILE *fp, const char *fname);
static int	f_cut(FILE *fp, const char *fname);

/* Helpers. */
static void	get_list(char *list);
static int	is_delim(wchar_t ch);
static void	needpos(size_t n);
static void	usage(void);
741590Srgrimes
751590Srgrimesint
76102944Sdwmalonemain(int argc, char *argv[])
771590Srgrimes{
781590Srgrimes	FILE *fp;
79131194Stjr	int (*fcn)(FILE *, const char *);
8097218Stjr	int ch, rval;
81131197Stjr	size_t n;
821590Srgrimes
8398012Stjr	setlocale(LC_ALL, "");
8498012Stjr
8543533Seivind	fcn = NULL;
861590Srgrimes	dchar = '\t';			/* default delimiter is \t */
87131197Stjr	strcpy(dcharmb, "\t");
881590Srgrimes
89243320Seadler	while ((ch = getopt(argc, argv, "b:c:d:f:snw")) != -1)
901590Srgrimes		switch(ch) {
9143531Seivind		case 'b':
9298035Stjr			get_list(optarg);
9398035Stjr			bflag = 1;
9498035Stjr			break;
951590Srgrimes		case 'c':
961590Srgrimes			get_list(optarg);
971590Srgrimes			cflag = 1;
981590Srgrimes			break;
991590Srgrimes		case 'd':
100131197Stjr			n = mbrtowc(&dchar, optarg, MB_LEN_MAX, NULL);
101131197Stjr			if (dchar == '\0' || n != strlen(optarg))
102131197Stjr				errx(1, "bad delimiter");
103131197Stjr			strcpy(dcharmb, optarg);
1041590Srgrimes			dflag = 1;
1051590Srgrimes			break;
1061590Srgrimes		case 'f':
1071590Srgrimes			get_list(optarg);
1081590Srgrimes			fflag = 1;
1091590Srgrimes			break;
1101590Srgrimes		case 's':
1111590Srgrimes			sflag = 1;
1121590Srgrimes			break;
11343531Seivind		case 'n':
11498035Stjr			nflag = 1;
11543531Seivind			break;
116243320Seadler		case 'w':
117243320Seadler			wflag = 1;
118243320Seadler			break;
1191590Srgrimes		case '?':
1201590Srgrimes		default:
1211590Srgrimes			usage();
1221590Srgrimes		}
1231590Srgrimes	argc -= optind;
1241590Srgrimes	argv += optind;
1251590Srgrimes
1261590Srgrimes	if (fflag) {
127243320Seadler		if (bflag || cflag || nflag || (wflag && dflag))
1281590Srgrimes			usage();
129243320Seadler	} else if (!(bflag || cflag) || dflag || sflag || wflag)
1301590Srgrimes		usage();
13198035Stjr	else if (!bflag && nflag)
13298035Stjr		usage();
1331590Srgrimes
134131194Stjr	if (fflag)
135131194Stjr		fcn = f_cut;
136131194Stjr	else if (cflag)
137131194Stjr		fcn = MB_CUR_MAX > 1 ? c_cut : b_cut;
138131194Stjr	else if (bflag)
139131194Stjr		fcn = nflag && MB_CUR_MAX > 1 ? b_n_cut : b_cut;
14098035Stjr
14197218Stjr	rval = 0;
1421590Srgrimes	if (*argv)
1431590Srgrimes		for (; *argv; ++argv) {
14497237Stjr			if (strcmp(*argv, "-") == 0)
145131194Stjr				rval |= fcn(stdin, "stdin");
14697237Stjr			else {
14797237Stjr				if (!(fp = fopen(*argv, "r"))) {
14897237Stjr					warn("%s", *argv);
14997237Stjr					rval = 1;
15097237Stjr					continue;
15197237Stjr				}
15297237Stjr				fcn(fp, *argv);
15397237Stjr				(void)fclose(fp);
15497218Stjr			}
1551590Srgrimes		}
1561590Srgrimes	else
157131194Stjr		rval = fcn(stdin, "stdin");
15897218Stjr	exit(rval);
1591590Srgrimes}
1601590Srgrimes
161227162Sedstatic void
162102944Sdwmaloneget_list(char *list)
1631590Srgrimes{
16471725Swill	size_t setautostart, start, stop;
16543533Seivind	char *pos;
1661590Srgrimes	char *p;
1671590Srgrimes
1681590Srgrimes	/*
1691590Srgrimes	 * set a byte in the positions array to indicate if a field or
1701590Srgrimes	 * column is to be selected; use +1, it's 1-based, not 0-based.
171131196Stjr	 * Numbers and number ranges may be overlapping, repeated, and in
172236866Skevlo	 * any order. We handle "-3-5" although there's no real reason to.
1731590Srgrimes	 */
17443533Seivind	for (; (p = strsep(&list, ", \t")) != NULL;) {
1751590Srgrimes		setautostart = start = stop = 0;
1761590Srgrimes		if (*p == '-') {
1771590Srgrimes			++p;
1781590Srgrimes			setautostart = 1;
1791590Srgrimes		}
18043533Seivind		if (isdigit((unsigned char)*p)) {
1811590Srgrimes			start = stop = strtol(p, &p, 10);
1821590Srgrimes			if (setautostart && start > autostart)
1831590Srgrimes				autostart = start;
1841590Srgrimes		}
1851590Srgrimes		if (*p == '-') {
18643533Seivind			if (isdigit((unsigned char)p[1]))
1871590Srgrimes				stop = strtol(p + 1, &p, 10);
1881590Srgrimes			if (*p == '-') {
1891590Srgrimes				++p;
1901590Srgrimes				if (!autostop || autostop > stop)
1911590Srgrimes					autostop = stop;
1921590Srgrimes			}
1931590Srgrimes		}
1941590Srgrimes		if (*p)
195236866Skevlo			errx(1, "[-bcf] list: illegal list value");
1961590Srgrimes		if (!stop || !start)
197236866Skevlo			errx(1, "[-bcf] list: values may not include zero");
19897234Stjr		if (maxval < stop) {
1991590Srgrimes			maxval = stop;
20097234Stjr			needpos(maxval + 1);
20197234Stjr		}
2021590Srgrimes		for (pos = positions + start; start++ <= stop; *pos++ = 1);
2031590Srgrimes	}
2041590Srgrimes
2051590Srgrimes	/* overlapping ranges */
20697234Stjr	if (autostop && maxval > autostop) {
2071590Srgrimes		maxval = autostop;
20897234Stjr		needpos(maxval + 1);
20997234Stjr	}
2101590Srgrimes
2111590Srgrimes	/* set autostart */
2121590Srgrimes	if (autostart)
2131590Srgrimes		memset(positions + 1, '1', autostart);
2141590Srgrimes}
2151590Srgrimes
216227162Sedstatic void
21797234Stjrneedpos(size_t n)
21897234Stjr{
21997234Stjr	static size_t npos;
22097328Stjr	size_t oldnpos;
22197234Stjr
22297234Stjr	/* Grow the positions array to at least the specified size. */
22397234Stjr	if (n > npos) {
22497328Stjr		oldnpos = npos;
22597234Stjr		if (npos == 0)
22697234Stjr			npos = n;
22797234Stjr		while (n > npos)
22897234Stjr			npos *= 2;
22997234Stjr		if ((positions = realloc(positions, npos)) == NULL)
23097234Stjr			err(1, "realloc");
23197328Stjr		memset((char *)positions + oldnpos, 0, npos - oldnpos);
23297234Stjr	}
23397234Stjr}
23497234Stjr
235227162Sedstatic int
236131201Stjrb_cut(FILE *fp, const char *fname __unused)
237131194Stjr{
238131194Stjr	int ch, col;
239131194Stjr	char *pos;
240131194Stjr
241131194Stjr	ch = 0;
242131194Stjr	for (;;) {
243131194Stjr		pos = positions + 1;
244131194Stjr		for (col = maxval; col; --col) {
245131194Stjr			if ((ch = getc(fp)) == EOF)
246131194Stjr				return (0);
247131194Stjr			if (ch == '\n')
248131194Stjr				break;
249131194Stjr			if (*pos++)
250131194Stjr				(void)putchar(ch);
251131194Stjr		}
252131194Stjr		if (ch != '\n') {
253131194Stjr			if (autostop)
254131194Stjr				while ((ch = getc(fp)) != EOF && ch != '\n')
255131194Stjr					(void)putchar(ch);
256131194Stjr			else
257131194Stjr				while ((ch = getc(fp)) != EOF && ch != '\n');
258131194Stjr		}
259131194Stjr		(void)putchar('\n');
260131194Stjr	}
261131194Stjr	return (0);
262131194Stjr}
263131194Stjr
26498035Stjr/*
26598035Stjr * Cut based on byte positions, taking care not to split multibyte characters.
26698035Stjr * Although this function also handles the case where -n is not specified,
267131194Stjr * b_cut() ought to be much faster.
26898035Stjr */
269227162Sedstatic int
270102944Sdwmaloneb_n_cut(FILE *fp, const char *fname)
27198035Stjr{
27298035Stjr	size_t col, i, lbuflen;
27398035Stjr	char *lbuf;
27498035Stjr	int canwrite, clen, warned;
275131183Stjr	mbstate_t mbs;
27698035Stjr
277131183Stjr	memset(&mbs, 0, sizeof(mbs));
27898035Stjr	warned = 0;
27998035Stjr	while ((lbuf = fgetln(fp, &lbuflen)) != NULL) {
28098035Stjr		for (col = 0; lbuflen > 0; col += clen) {
281131183Stjr			if ((clen = mbrlen(lbuf, lbuflen, &mbs)) < 0) {
28298035Stjr				if (!warned) {
28398035Stjr					warn("%s", fname);
28498035Stjr					warned = 1;
28598035Stjr				}
286131183Stjr				memset(&mbs, 0, sizeof(mbs));
28798035Stjr				clen = 1;
28898035Stjr			}
28998035Stjr			if (clen == 0 || *lbuf == '\n')
29098035Stjr				break;
29198035Stjr			if (col < maxval && !positions[1 + col]) {
29298035Stjr				/*
29398035Stjr				 * Print the character if (1) after an initial
29498035Stjr				 * segment of un-selected bytes, the rest of
29598035Stjr				 * it is selected, and (2) the last byte is
29698035Stjr				 * selected.
29798035Stjr				 */
29898035Stjr				i = col;
29998035Stjr				while (i < col + clen && i < maxval &&
30098035Stjr				    !positions[1 + i])
30198035Stjr					i++;
30298035Stjr				canwrite = i < col + clen;
30398035Stjr				for (; i < col + clen && i < maxval; i++)
30498035Stjr					canwrite &= positions[1 + i];
30598035Stjr				if (canwrite)
30698035Stjr					fwrite(lbuf, 1, clen, stdout);
30798035Stjr			} else {
30898035Stjr				/*
30998035Stjr				 * Print the character if all of it has
31098035Stjr				 * been selected.
31198035Stjr				 */
31298035Stjr				canwrite = 1;
31398035Stjr				for (i = col; i < col + clen; i++)
31498035Stjr					if ((i >= maxval && !autostop) ||
31598035Stjr					    (i < maxval && !positions[1 + i])) {
31698035Stjr						canwrite = 0;
31798035Stjr						break;
31898035Stjr					}
31998035Stjr				if (canwrite)
32098035Stjr					fwrite(lbuf, 1, clen, stdout);
32198035Stjr			}
32298035Stjr			lbuf += clen;
32398035Stjr			lbuflen -= clen;
32498035Stjr		}
32598035Stjr		if (lbuflen > 0)
32698035Stjr			putchar('\n');
32798035Stjr	}
328131194Stjr	return (warned);
32998035Stjr}
33098035Stjr
331227162Sedstatic int
332131194Stjrc_cut(FILE *fp, const char *fname)
3331590Srgrimes{
334131194Stjr	wint_t ch;
335131194Stjr	int col;
33643533Seivind	char *pos;
3371590Srgrimes
33843533Seivind	ch = 0;
3391590Srgrimes	for (;;) {
3401590Srgrimes		pos = positions + 1;
3411590Srgrimes		for (col = maxval; col; --col) {
342131194Stjr			if ((ch = getwc(fp)) == WEOF)
343131194Stjr				goto out;
3441590Srgrimes			if (ch == '\n')
3451590Srgrimes				break;
3461590Srgrimes			if (*pos++)
347131194Stjr				(void)putwchar(ch);
3481590Srgrimes		}
34943533Seivind		if (ch != '\n') {
3501590Srgrimes			if (autostop)
351131194Stjr				while ((ch = getwc(fp)) != WEOF && ch != '\n')
352131194Stjr					(void)putwchar(ch);
3531590Srgrimes			else
354131194Stjr				while ((ch = getwc(fp)) != WEOF && ch != '\n');
35543533Seivind		}
356131194Stjr		(void)putwchar('\n');
3571590Srgrimes	}
358131194Stjrout:
359131194Stjr	if (ferror(fp)) {
360131194Stjr		warn("%s", fname);
361131194Stjr		return (1);
362131194Stjr	}
363131194Stjr	return (0);
3641590Srgrimes}
3651590Srgrimes
366227162Sedstatic int
367243474Sandrewis_delim(wchar_t ch)
368243320Seadler{
369243320Seadler	if (wflag) {
370243320Seadler		if (ch == ' ' || ch == '\t')
371243320Seadler			return 1;
372243320Seadler	} else {
373243320Seadler		if (ch == dchar)
374243320Seadler			return 1;
375243320Seadler	}
376243320Seadler	return 0;
377243320Seadler}
378243320Seadler
379243320Seadlerstatic int
380131197Stjrf_cut(FILE *fp, const char *fname)
3811590Srgrimes{
382131197Stjr	wchar_t ch;
383131197Stjr	int field, i, isdelim;
384131197Stjr	char *pos, *p;
3851590Srgrimes	int output;
38698012Stjr	char *lbuf, *mlbuf;
387137250Stjr	size_t clen, lbuflen, reallen;
3881590Srgrimes
38998012Stjr	mlbuf = NULL;
390243320Seadler	while ((lbuf = fgetln(fp, &lbuflen)) != NULL) {
391137250Stjr		reallen = lbuflen;
39275930Sdd		/* Assert EOL has a newline. */
39375930Sdd		if (*(lbuf + lbuflen - 1) != '\n') {
39475930Sdd			/* Can't have > 1 line with no trailing newline. */
39575930Sdd			mlbuf = malloc(lbuflen + 1);
39675930Sdd			if (mlbuf == NULL)
39775930Sdd				err(1, "malloc");
39875930Sdd			memcpy(mlbuf, lbuf, lbuflen);
39975930Sdd			*(mlbuf + lbuflen) = '\n';
40075930Sdd			lbuf = mlbuf;
401137250Stjr			reallen++;
40275930Sdd		}
4037200Sache		output = 0;
404131197Stjr		for (isdelim = 0, p = lbuf;; p += clen) {
405137250Stjr			clen = mbrtowc(&ch, p, lbuf + reallen - p, NULL);
406131197Stjr			if (clen == (size_t)-1 || clen == (size_t)-2) {
407131197Stjr				warnc(EILSEQ, "%s", fname);
408131197Stjr				free(mlbuf);
409131197Stjr				return (1);
410131197Stjr			}
411131197Stjr			if (clen == 0)
412131197Stjr				clen = 1;
4131590Srgrimes			/* this should work if newline is delimiter */
414243320Seadler			if (is_delim(ch))
4151590Srgrimes				isdelim = 1;
4161590Srgrimes			if (ch == '\n') {
4171590Srgrimes				if (!isdelim && !sflag)
41875930Sdd					(void)fwrite(lbuf, lbuflen, 1, stdout);
4191590Srgrimes				break;
4201590Srgrimes			}
4211590Srgrimes		}
4221590Srgrimes		if (!isdelim)
4231590Srgrimes			continue;
4241590Srgrimes
4251590Srgrimes		pos = positions + 1;
4261590Srgrimes		for (field = maxval, p = lbuf; field; --field, ++pos) {
427131197Stjr			if (*pos && output++)
428131197Stjr				for (i = 0; dcharmb[i] != '\0'; i++)
429131197Stjr					putchar(dcharmb[i]);
430131197Stjr			for (;;) {
431137250Stjr				clen = mbrtowc(&ch, p, lbuf + reallen - p,
432131197Stjr				    NULL);
433131197Stjr				if (clen == (size_t)-1 || clen == (size_t)-2) {
434131197Stjr					warnc(EILSEQ, "%s", fname);
435131197Stjr					free(mlbuf);
436131197Stjr					return (1);
437131197Stjr				}
438131197Stjr				if (clen == 0)
439131197Stjr					clen = 1;
440131197Stjr				p += clen;
441243320Seadler				if (ch == '\n' || is_delim(ch)) {
442243320Seadler					/* compress whitespace */
443243320Seadler					if (wflag && ch != '\n')
444243320Seadler						while (is_delim(*p))
445243320Seadler							p++;
446131197Stjr					break;
447243320Seadler				}
448131197Stjr				if (*pos)
449131197Stjr					for (i = 0; i < (int)clen; i++)
450131197Stjr						putchar(p[i - clen]);
45143533Seivind			}
4521590Srgrimes			if (ch == '\n')
4531590Srgrimes				break;
4541590Srgrimes		}
45543533Seivind		if (ch != '\n') {
4561590Srgrimes			if (autostop) {
4571590Srgrimes				if (output)
458131197Stjr					for (i = 0; dcharmb[i] != '\0'; i++)
459131197Stjr						putchar(dcharmb[i]);
4601590Srgrimes				for (; (ch = *p) != '\n'; ++p)
4611590Srgrimes					(void)putchar(ch);
4621590Srgrimes			} else
4631590Srgrimes				for (; (ch = *p) != '\n'; ++p);
46443533Seivind		}
4651590Srgrimes		(void)putchar('\n');
4661590Srgrimes	}
467131197Stjr	free(mlbuf);
468131194Stjr	return (0);
4691590Srgrimes}
4701590Srgrimes
/* Print the three-line synopsis on standard error and exit with status 1. */
static void
usage(void)
{
	static const char *const synopsis[] = {
		"usage: cut -b list [-n] [file ...]",
		"       cut -c list [file ...]",
		"       cut -f list [-s] [-w | -d delim] [file ...]"
	};

	(void)fprintf(stderr, "%s\n%s\n%s\n",
	    synopsis[0], synopsis[1], synopsis[2]);
	exit(1);
}
480