1/* vi: set sw=4 ts=4: */
2/*
3 * wc implementation for busybox
4 *
5 * Copyright (C) 2003  Manuel Novoa III  <mjn3@codepoet.org>
6 *
7 * Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
8 */
9
10/* BB_AUDIT SUSv3 compliant. */
11/* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */
12
13/* Mar 16, 2003      Manuel Novoa III   (mjn3@codepoet.org)
14 *
15 * Rewritten to fix a number of problems and do some size optimizations.
16 * Problems in the previous busybox implementation (besides bloat) included:
17 *  1) broken 'wc -c' optimization (read note below)
18 *  2) broken handling of '-' args
19 *  3) no checking of ferror on EOF returns
20 *  4) isprint() wasn't considered when word counting.
21 *
22 * NOTES:
23 *
24 * The previous busybox wc attempted an optimization using stat for the
25 * case of counting chars only.  I omitted that because it was broken.
26 * It didn't take into account the possibility of input coming from a
27 * pipe, or input from a file with file pointer not at the beginning.
28 *
29 * To implement such a speed optimization correctly, not only do you
30 * need the size, but also the file position.  Note also that the
31 * file position may be past the end of file.  Consider the example
32 * (adapted from example in gnu wc.c)
33 *
34 *      echo hello > /tmp/testfile &&
35 *      (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile
36 *
37 * for which 'wc -c' should output '0'.
38 */
39#include "libbb.h"
40#include "unicode.h"
41
#if !ENABLE_LOCALE_SUPPORT
/* Locale support is off: override any existing isprint/isspace with
 * fixed ASCII-only tests.  Printable means 0x20..0x7e inclusive, and
 * the only character treated as a space is the blank itself. */
# undef isspace
# undef isprint
# define isspace(c) (' ' == (c))
# define isprint(c) ((unsigned)((c) - 0x20) < (0x7f - 0x20))
#endif
48
/* Counter type used for every tally, together with the printf length/
 * conversion suffix that matches it (appended after a "%" by the users). */
#if !ENABLE_FEATURE_WC_LARGE
# define COUNT_FMT "u"
# define COUNT_T unsigned
#else
# define COUNT_FMT "llu"
# define COUNT_T unsigned long long
#endif
56
57/* We support -m even when UNICODE_SUPPORT is off,
58 * we just don't advertise it in help text,
59 * since it is the same as -c in this case.
60 */
61
62//usage:#define wc_trivial_usage
63//usage:       "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..."
64//usage:
65//usage:#define wc_full_usage "\n\n"
66//usage:       "Count lines, words, and bytes for each FILE (or stdin)\n"
67//usage:     "\nOptions:"
68//usage:     "\n	-c	Count bytes"
69//usage:	IF_UNICODE_SUPPORT(
70//usage:     "\n	-m	Count characters"
71//usage:	)
72//usage:     "\n	-l	Count newlines"
73//usage:     "\n	-w	Count words"
74//usage:     "\n	-L	Print longest line length"
75//usage:
76//usage:#define wc_example_usage
77//usage:       "$ wc /etc/passwd\n"
78//usage:       "     31      46    1365 /etc/passwd\n"
79
80/* Order is important if we want to be compatible with
81 * column order in "wc -cmlwL" output:
82 */
enum {
	WC_LINES = 0,	/* index 0: number of '\n' bytes seen */
	WC_WORDS,	/* index 1: number of words */
	WC_UNICHARS,	/* index 2: characters (differs from bytes only with Unicode on) */
	WC_CHARS,	/* index 3: bytes read */
	WC_LENGTH,	/* index 4: longest line length */
	NUM_WCS		/* 5: number of counters, used as the array size */
};
91
92int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
93int wc_main(int argc UNUSED_PARAM, char **argv)
94{
95	const char *arg;
96	const char *start_fmt = " %9"COUNT_FMT + 1;
97	const char *fname_fmt = " %s\n";
98	COUNT_T *pcounts;
99	COUNT_T counts[NUM_WCS];
100	COUNT_T totals[NUM_WCS];
101	int num_files;
102	smallint status = EXIT_SUCCESS;
103	unsigned print_type;
104
105	init_unicode();
106
107	print_type = getopt32(argv, "lwcmL");
108
109	if (print_type == 0) {
110		print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_CHARS);
111	}
112
113	argv += optind;
114	if (!argv[0]) {
115		*--argv = (char *) bb_msg_standard_input;
116		fname_fmt = "\n";
117	}
118	if (!argv[1]) { /* zero or one filename? */
119		if (!((print_type-1) & print_type)) /* exactly one option? */
120			start_fmt = "%"COUNT_FMT;
121	}
122
123	memset(totals, 0, sizeof(totals));
124
125	pcounts = counts;
126
127	num_files = 0;
128	while ((arg = *argv++) != NULL) {
129		FILE *fp;
130		const char *s;
131		unsigned u;
132		unsigned linepos;
133		smallint in_word;
134
135		++num_files;
136		fp = fopen_or_warn_stdin(arg);
137		if (!fp) {
138			status = EXIT_FAILURE;
139			continue;
140		}
141
142		memset(counts, 0, sizeof(counts));
143		linepos = 0;
144		in_word = 0;
145
146		while (1) {
147			int c;
148			/* Our -w doesn't match GNU wc exactly... oh well */
149
150			c = getc(fp);
151			if (c == EOF) {
152				if (ferror(fp)) {
153					bb_simple_perror_msg(arg);
154					status = EXIT_FAILURE;
155				}
156				goto DO_EOF;		/* Treat an EOF as '\r'. */
157			}
158
159			/* Cater for -c and -m */
160			++counts[WC_CHARS];
161			if (unicode_status != UNICODE_ON /* every byte is a new char */
162			 || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */
163			) {
164				++counts[WC_UNICHARS];
165			}
166
167			if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */
168				++linepos;
169				if (!isspace(c)) {
170					in_word = 1;
171					continue;
172				}
173			} else if ((unsigned)(c - 9) <= 4) {
174				/* \t  9
175				 * \n 10
176				 * \v 11
177				 * \f 12
178				 * \r 13
179				 */
180				if (c == '\t') {
181					linepos = (linepos | 7) + 1;
182				} else {			/* '\n', '\r', '\f', or '\v' */
183 DO_EOF:
184					if (linepos > counts[WC_LENGTH]) {
185						counts[WC_LENGTH] = linepos;
186					}
187					if (c == '\n') {
188						++counts[WC_LINES];
189					}
190					if (c != '\v') {
191						linepos = 0;
192					}
193				}
194			} else {
195				continue;
196			}
197
198			counts[WC_WORDS] += in_word;
199			in_word = 0;
200			if (c == EOF) {
201				break;
202			}
203		}
204
205		fclose_if_not_stdin(fp);
206
207		if (totals[WC_LENGTH] < counts[WC_LENGTH]) {
208			totals[WC_LENGTH] = counts[WC_LENGTH];
209		}
210		totals[WC_LENGTH] -= counts[WC_LENGTH];
211
212 OUTPUT:
213		/* coreutils wc tries hard to print pretty columns
214		 * (saves results for all files, finds max col len etc...)
215		 * we won't try that hard, it will bloat us too much */
216		s = start_fmt;
217		u = 0;
218		do {
219			if (print_type & (1 << u)) {
220				printf(s, pcounts[u]);
221				s = " %9"COUNT_FMT; /* Ok... restore the leading space. */
222			}
223			totals[u] += pcounts[u];
224		} while (++u < NUM_WCS);
225		printf(fname_fmt, arg);
226	}
227
228	/* If more than one file was processed, we want the totals.  To save some
229	 * space, we set the pcounts ptr to the totals array.  This has the side
230	 * effect of trashing the totals array after outputting it, but that's
231	 * irrelavent since we no longer need it. */
232	if (num_files > 1) {
233		num_files = 0;				/* Make sure we don't get here again. */
234		arg = "total";
235		pcounts = totals;
236		--argv;
237		goto OUTPUT;
238	}
239
240	fflush_stdout_and_exit(status);
241}
242