fmt.c revision 1130:357a4bd0e502
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28/*	  All Rights Reserved	*/
29
30#pragma ident	"%Z%%M%	%I%	%E% SMI"
31
32#include <stdio.h>
33#include <stdlib.h>
34#include <ctype.h>
35#include <wctype.h>
36#include <widec.h>
37#include <dlfcn.h>
38#include <locale.h>
39#include <sys/param.h>
40#include <string.h>
41
42/*
43 * fmt -- format the concatenation of input files or standard input
44 * onto standard output.  Designed for use with Mail ~|
45 *
46 * Syntax: fmt [ -width | -w width ] [ -cs ] [ name ... ]
47 * Author: Kurt Shoens (UCB) 12/7/78
48 */
49
/*
 * NOSTR is the "no pointer" value for wide strings; outp is set to it
 * whenever no output line is under construction.
 * MAXLINES bounds the number of candidate mail header lines kept in hdrbuf.
 */
#define	NOSTR	((wchar_t *)0)	/* Null string pointer for lint */
#define	MAXLINES	100	/* maximum mail header lines to verify */

/*
 * Output line under construction.  leadin() writes the leading blanks
 * and records their count in filler; pack() appends words at outp;
 * oflush() terminates the line, prints it and resets outp to NOSTR.
 */
wchar_t	outbuf[BUFSIZ];			/* Sandbagged output line image */
wchar_t	*outp;				/* Pointer in above */
int	filler;				/* Filler amount in outbuf */

/* Formatting parameters and status shared by the whole file. */
int	pfx;			/* Current leading blank count */
int	width = 72;		/* Width that we will not exceed */
int	nojoin = 0;		/* split lines only, don't join short ones */
int	errs = 0;		/* Current number of errors */

/*
 * Crown margin (-c) state machine, advanced by prefix() and leadin():
 *   c_none  - crown margin mode not requested
 *   c_reset - set by -c and after each blank line; the next non-blank
 *	       line's indent becomes crown_head
 *   c_head  - crown_head recorded; no output line started yet
 *   c_lead  - first output line started with the crown_head indent
 *   c_fixup - a further output line was started with crown_head before
 *	       the body indent was known; prefix() re-splits it
 *   c_body  - crown_body recorded; later output lines use that indent
 */
enum crown_type	{c_none, c_reset, c_head, c_lead, c_fixup, c_body};
enum crown_type	crown_state;	/* Crown margin state */
int	crown_head;		/* The header offset */
int	crown_body;		/* The body offset */
66	/* currently-known initial strings found in mail headers */
/*
 * The list is terminated by a null pointer.  header_chk() walks it and
 * tests each entry against the start of a buffered line with ispref();
 * entries such as "Content" and "Message-I" are deliberately only the
 * leading part of a field name.  Comparison is case sensitive, which is
 * why "Bcc"/"bcc" and "Cc"/"cc" both appear.
 */
wchar_t	*headnames[] = {
	L"Apparently-To", L"Bcc", L"bcc", L"Cc", L"cc", L"Confirmed-By",
	L"Content", L"content-length", L"From", L"Date", L"id",
	L"Message-I", L"MIME-Version", L"Precedence", L"Return-Path",
	L"Received", L"Reply-To", L"Status", L"Subject", L"To", L"X-IMAP",
	L"X-Lines", L"X-Sender", L"X-Sun", L"X-Status", L"X-UID",
	0};
74
/*
 * Mail header detection state.  fmt() dispatches every input line on
 * hdr_state; header_chk() chooses between flush_hdr and do_hdr;
 * process_hdrbuf() returns the state to not_in_hdr once hdrbuf has been
 * emitted; fill_hdrbuf() switches to off when it cannot allocate memory.
 */
enum hdr_type {
	off,		/* mail header processing is off */
	not_in_hdr,	/* not currently processing a mail header */
	in_hdr, 	/* currently filling hdrbuf with potential hdr lines */
	flush_hdr,	/* flush hdrbuf; not a header, no special processing */
	do_hdr		/* process hdrbuf as a mail header */
};
				/* current state of hdrbuf */
enum hdr_type	hdr_state = not_in_hdr;

/*
 * Each used slot of hdrbuf is a malloc'ed copy of one input line made by
 * fill_hdrbuf(); the copies are freed as they are emitted.
 */
wchar_t *hdrbuf[MAXLINES];	/* buffer to hold potential mail header lines */
int 	h_lines;		/* index into lines of hdrbuf */
87
/*
 * split is the line splitter in use: main() points it at csplit() when
 * the LC_CTYPE locale is "C" and at msplit() otherwise.
 */
void (*(split))(wchar_t []);
/* scrwidth() and ishead() are not defined in this file. */
extern int scrwidth(wchar_t);
extern int ishead(char []);


/* Forward declarations for the functions defined below. */
static void fill_hdrbuf(wchar_t []);
static void header_chk(void);
static void process_hdrbuf(void);
static void leadin(void);
static void tabulate(wchar_t []);
static void oflush(void);
static void pack(wchar_t []);
static void msplit(wchar_t []);
static void csplit(wchar_t []);
static void _wckind_init(void);
static void prefix(wchar_t []);
static void fmt(FILE *);
static int setopt(char *);
int _wckind(wchar_t);
107
108/*
109 * Drive the whole formatter by managing input files.  Also,
110 * cause initialization of the output stuff and flush it out
111 * at the end.
112 */
113
114int
115main(int argc, char **argv)
116{
117	FILE *fi;
118	char sobuf[BUFSIZ];
119	char *cp;
120	int nofile;
121	char *locale;
122
123	outp = NOSTR;
124	setbuf(stdout, sobuf);
125	setlocale(LC_ALL, "");
126	locale = setlocale(LC_CTYPE, "");
127	if (strcmp(locale, "C") == 0) {
128		split = csplit;
129	} else {
130		split = msplit;
131		_wckind_init();
132	}
133	if (argc < 2) {
134single:
135		fmt(stdin);
136		oflush();
137		exit(0);
138	}
139	nofile = 1;
140	while (--argc) {
141		cp = *++argv;
142		if (setopt(cp))
143			continue;
144		nofile = 0;
145		if ((fi = fopen(cp, "r")) == NULL) {
146			perror(cp);
147			errs++;
148			continue;
149		}
150		fmt(fi);
151		fclose(fi);
152	}
153	if (nofile)
154		goto single;
155	oflush();
156	fclose(stdout);
157	return (errs);
158}
159
160/*
161 * Read up characters from the passed input file, forming lines,
162 * doing ^H processing, expanding tabs, stripping trailing blanks,
163 * and sending each line down for analysis.
164 */
165
166static void
167fmt(FILE *fi)
168{
169	wchar_t linebuf[BUFSIZ], canonb[BUFSIZ];
170	wchar_t *cp, *cp2;
171	int col;
172	wchar_t	c;
173	char	cbuf[BUFSIZ];	/* stores wchar_t string as char string */
174
175	c = getwc(fi);
176	while (c != EOF) {
177		/*
178		 * Collect a line, doing ^H processing.
179		 * Leave tabs for now.
180		 */
181
182		cp = linebuf;
183		while (c != L'\n' && c != EOF && cp-linebuf < BUFSIZ-1) {
184			if (c == L'\b') {
185				if (cp > linebuf)
186					cp--;
187				c = getwc(fi);
188				continue;
189			}
190			if (!(iswprint(c)) && c != L'\t') {
191				c = getwc(fi);
192				continue;
193			}
194			*cp++ = c;
195			c = getwc(fi);
196		}
197		*cp = L'\0';
198
199		/*
200		 * Toss anything remaining on the input line.
201		 */
202
203		while (c != L'\n' && c != EOF)
204			c = getwc(fi);
205		/*
206		 * Expand tabs on the way to canonb.
207		 */
208
209		col = 0;
210		cp = linebuf;
211		cp2 = canonb;
212		while (c = *cp++) {
213			if (c != L'\t') {
214				col += scrwidth(c);
215				if (cp2-canonb < BUFSIZ-1)
216					*cp2++ = c;
217				continue;
218			}
219			do {
220				if (cp2-canonb < BUFSIZ-1)
221					*cp2++ = L' ';
222				col++;
223			} while ((col & 07) != 0);
224		}
225
226		/*
227		 * Swipe trailing blanks from the line.
228		 */
229
230		for (cp2--; cp2 >= canonb && *cp2 == L' '; cp2--);
231		*++cp2 = '\0';
232
233			/* special processing to look for mail header lines */
234		switch (hdr_state) {
235		case off:
236			prefix(canonb);
237		case not_in_hdr:
238			/* look for an initial mail header line */
239			/* skip initial blanks */
240			for (cp = canonb; *cp == L' '; cp++);
241			/*
242			 * Need to convert string from wchar_t to char,
243			 * since this is what ishead() expects.  Since we
244			 * only want to make sure cp points to a "From" line
245			 * of the email, we don't have to alloc
246			 * BUFSIZ * MB_LEN_MAX to cbuf.
247			 */
248			wcstombs(cbuf, cp, (BUFSIZ - 1));
249			if (ishead(cbuf)) {
250				hdr_state = in_hdr;
251				fill_hdrbuf(canonb);
252			} else {
253				/* no mail header line; process normally */
254				prefix(canonb);
255			}
256			break;
257		case in_hdr:
258			/* already saw 1st mail header line; look for more */
259			if (canonb[0] == L'\0') {
260				/*
261				 * blank line means end of mail header;
262				 * verify current mail header buffer
263				 * then process it accordingly
264				 */
265				header_chk();
266				process_hdrbuf();
267				/* now process the current blank line */
268				prefix(canonb);
269			} else
270				/*
271				 * not a blank line--save this line as
272				 * a potential mail header line
273				 */
274				fill_hdrbuf(canonb);
275			break;
276		}
277		if (c != EOF)
278			c = getwc(fi);
279	}
280	/*
281	 * end of this file--make sure we process the stuff in
282	 * hdrbuf before we're finished
283	 */
284	if (hdr_state == in_hdr) {
285		header_chk();
286		process_hdrbuf();
287	}
288}
289
290/*
291 * Take a line devoid of tabs and other garbage and determine its
292 * blank prefix.  If the indent changes, call for a linebreak.
293 * If the input line is blank, echo the blank line on the output.
294 * Finally, if the line minus the prefix is a mail header, try to keep
295 * it on a line by itself.
296 */
297
298static void
299prefix(wchar_t line[])
300{
301	wchar_t *cp;
302	int np;
303	int nosplit = 0;	/* flag set if line should not be split */
304
305	if (line[0] == L'\0') {
306		oflush();
307		putchar('\n');
308		if (crown_state != c_none)
309			crown_state = c_reset;
310		return;
311	}
312	for (cp = line; *cp == L' '; cp++);
313	np = cp - line;
314
315	/*
316	 * The following horrible expression attempts to avoid linebreaks
317	 * when the indent changes due to a paragraph.
318	 */
319
320	if (crown_state == c_none && np != pfx && (np > pfx || abs(pfx-np) > 8))
321		oflush();
322	/*
323	 * if this is a mail header line, don't split it; flush previous
324	 * line, if any, so we don't join this line to it
325	 */
326	if (hdr_state == do_hdr) {
327		nosplit = 1;
328		oflush();
329	}
330	/* flush previous line so we don't join this one to it */
331	if (nojoin)
332		oflush();
333	/* nroff-type lines starting with '.' are not split nor joined */
334	if (!nosplit && (nosplit = (*cp == L'.')))
335		oflush();
336	pfx = np;
337	switch (crown_state) {
338	case c_reset:
339		crown_head = pfx;
340		crown_state = c_head;
341		break;
342	case c_lead:
343		crown_body = pfx;
344		crown_state = c_body;
345		break;
346	case c_fixup:
347		crown_body = pfx;
348		crown_state = c_body;
349		if (outp) {
350			wchar_t s[BUFSIZ];
351
352			*outp = L'\0';
353			wscpy(s, &outbuf[crown_head]);
354			outp = NOSTR;
355			split(s);
356		}
357		break;
358	}
359	if (nosplit) {
360		/* put whole input line onto outbuf and print it out */
361		pack(cp);
362		oflush();
363	} else
364		/*
365		 * split puts current line onto outbuf, but splits it
366		 * at word boundaries, if it exceeds desired length
367		 */
368		split(cp);
369	if (nojoin)
370		/*
371		 * flush current line so next lines, if any,
372		 * won't join to this one
373		 */
374		oflush();
375}
376
/*
 * Split up the passed line into output "words" which are
 * maximal strings of non-blanks with the blank separation
 * attached at the end.  Pass these words along to the output
 * line packer.
 *
 * This is the splitter used in the "C" locale.  A backslash followed
 * by white space keeps that white space inside the word.  The last
 * word of the line gets one trailing blank, or two if it ends in one
 * of ".:!?".
 *
 * The word buffer holds BUFSIZ-1 characters copied from the line plus
 * the two blanks that may be appended and the terminator.  Copying is
 * bounded as well: input longer than that is handed to pack() in
 * pieces instead of overrunning the buffer.
 */

static void
csplit(wchar_t line[])
{
	wchar_t *cp, *cp2;
	wchar_t word[BUFSIZ + 2];
	/* copies from line stop here; room is left for "  " and NUL */
	wchar_t *const limit = &word[BUFSIZ - 1];
	static const wchar_t *srchlist = (const wchar_t *) L".:!?";

	cp = line;
	while (*cp) {
		cp2 = word;

		/*
		 * Collect a 'word,' allowing it to contain escaped
		 * white space.
		 */

		while (*cp && !(iswspace(*cp)) && cp2 < limit) {
			if (*cp == '\\' && iswspace(cp[1])) {
				/* keep the escape and its blank together */
				if (cp2 + 1 >= limit)
					break;
				*cp2++ = *cp++;
			}
			*cp2++ = *cp++;
		}

		/*
		 * Guarantee a space at end of line.
		 * Two spaces after end of sentence punctuation.
		 */

		if (*cp == L'\0') {
			*cp2++ = L' ';
			if (wschr(srchlist, cp[-1]) != NULL)
				*cp2++ = L' ';
		}
		while (iswspace(*cp) && cp2 < limit)
			*cp2++ = *cp++;
		*cp2 = L'\0';
		pack(word);
	}
}
422
/*
 * Splitter used in every locale other than "C".  It works like
 * csplit(), and additionally ends a word wherever two neighbouring
 * characters differ in _wckind() class and at least one of them has a
 * non-zero wcsetno() value, so such text can be broken without white
 * space between the words.
 *
 * The word buffer holds BUFSIZ-1 characters copied from the line plus
 * the two blanks that may be appended and the terminator.  Copying is
 * bounded as well: input longer than that is handed to pack() in
 * pieces instead of overrunning the buffer.
 */
static void
msplit(wchar_t line[])
{
	wchar_t *cp, *cp2, prev;
	wchar_t word[BUFSIZ + 2];
	/* copies from line stop here; room is left for "  " and NUL */
	wchar_t *const limit = &word[BUFSIZ - 1];
	static const wchar_t *srchlist = (const wchar_t *) L".:!?";

	cp = line;
	while (*cp) {
		cp2 = word;
		prev = *cp;

		/*
		 * Collect a 'word,' allowing it to contain escaped
		 * white space.
		 */

		while (*cp && cp2 < limit) {
			if (iswspace(*cp))
				break;
			/* class change involving a non-zero codeset */
			if (_wckind(*cp) != _wckind(prev))
				if (wcsetno(*cp) != 0 || wcsetno(prev) != 0)
					break;
			if (*cp == '\\' && iswspace(cp[1])) {
				/* keep the escape and its blank together */
				if (cp2 + 1 >= limit)
					break;
				*cp2++ = *cp++;
			}
			prev = *cp;
			*cp2++ = *cp++;
		}

		/*
		 * Guarantee a space at end of line.
		 * Two spaces after end of sentence punctuation.
		 */

		if (*cp == L'\0') {
			*cp2++ = L' ';
			if (wschr(srchlist, cp[-1]) != NULL)
				*cp2++ = L' ';
		}
		while (iswspace(*cp) && cp2 < limit)
			*cp2++ = *cp++;
		*cp2 = L'\0';
		pack(word);
	}
}
468
469/*
470 * Output section.
471 * Build up line images from the words passed in.  Prefix
472 * each line with correct number of blanks.  The buffer "outbuf"
473 * contains the current partial line image, including prefixed blanks.
474 * "outp" points to the next available space therein.  When outp is NOSTR,
475 * there ain't nothing in there yet.  At the bottom of this whole mess,
476 * leading tabs are reinserted.
477 */
478
479/*
480 * Pack a word onto the output line.  If this is the beginning of
481 * the line, push on the appropriately-sized string of blanks first.
482 * If the word won't fit on the current line, flush and begin a new
483 * line.  If the word is too long to fit all by itself on a line,
484 * just give it its own and hope for the best.
485 */
486
487static void
488pack(wchar_t word[])
489{
490	wchar_t *cp;
491	int s, t;
492
493	if (outp == NOSTR)
494		leadin();
495	t = wscol(word);
496	*outp = L'\0';
497	s = wscol(outbuf);
498	if (t+s <= width) {
499		for (cp = word; *cp; *outp++ = *cp++);
500		return;
501	}
502	if (s > filler) {
503		oflush();
504		leadin();
505	}
506	for (cp = word; *cp; *outp++ = *cp++);
507}
508
509/*
510 * If there is anything on the current output line, send it on
511 * its way.  Set outp to NOSTR to indicate the absence of the current
512 * line prefix.
513 */
514
515static void
516oflush(void)
517{
518	if (outp == NOSTR)
519		return;
520	*outp = L'\0';
521	tabulate(outbuf);
522	outp = NOSTR;
523}
524
/*
 * Print one finished line on standard output.  Trailing blanks are
 * removed in place, the leading run of blanks is written as one tab
 * for every 8 blanks followed by the remaining 0-7 blanks, and the
 * rest of the line is written unchanged, followed by a newline.
 */

static void
tabulate(wchar_t line[])
{
	wchar_t *p;
	int blanks, tabs;

	/* Toss trailing blanks in the output line */
	p = line + wslen(line);
	while (p > line && p[-1] == L' ')
		p--;
	*p = L'\0';

	/* Count the leading blank space and tabulate */
	p = line;
	while (*p == L' ')
		p++;
	blanks = p - line;
	for (tabs = blanks >> 3; tabs > 0; tabs--)
		putc('\t', stdout);
	for (blanks &= 07; blanks > 0; blanks--)
		putc(' ', stdout);

	/* the text itself */
	for (; *p != L'\0'; p++)
		putwc(*p, stdout);
	putc('\n', stdout);
}
559
560/*
561 * Initialize the output line with the appropriate number of
562 * leading blanks.
563 */
564
565static void
566leadin(void)
567{
568	int b;
569	wchar_t *cp;
570	int l;
571
572	switch (crown_state) {
573	case c_head:
574		l = crown_head;
575		crown_state = c_lead;
576		break;
577
578	case c_lead:
579	case c_fixup:
580		l = crown_head;
581		crown_state = c_fixup;
582		break;
583
584	case c_body:
585		l = crown_body;
586		break;
587
588	default:
589		l = pfx;
590		break;
591	}
592	filler = l;
593	for (b = 0, cp = outbuf; b < l; b++)
594		*cp++ = L' ';
595	outp = cp;
596}
597
/*
 * Is s1 a prefix of s2??
 *
 * Returns 1 if every character of s1 appears, in order, at the start
 * of s2, and 0 otherwise.  An empty s1 is a prefix of any string.  If
 * s2 ends before s1 does, s1 is not a prefix of it, so the result is 0.
 */

static int
ispref(const wchar_t *s1, const wchar_t *s2)
{

	while (*s1 != L'\0') {
		/* also fails here when s2 has run out (*s2 == L'\0') */
		if (*s1++ != *s2++)
			return (0);
	}
	return (1);
}
611
612/*
613 * Set an input option
614 */
615
616static int
617setopt(char *cp)
618{
619	static int ws = 0;
620
621	if (*cp == '-') {
622		if (cp[1] == 'c' && cp[2] == '\0') {
623			crown_state = c_reset;
624			return (1);
625		}
626		if (cp[1] == 's' && cp[2] == '\0') {
627			nojoin = 1;
628			return (1);
629		}
630		if (cp[1] == 'w' && cp[2] == '\0') {
631			ws++;
632			return (1);
633		}
634		width = atoi(cp+1);
635	} else if (ws) {
636		width = atoi(cp);
637		ws = 0;
638	} else
639		return (0);
640	if (width <= 0 || width >= BUFSIZ-2) {
641		fprintf(stderr, "fmt:  bad width: %d\n", width);
642		exit(1);
643	}
644	return (1);
645}
646
647
/*
 * Support for classifying wide characters into "kinds", used by
 * msplit() to find word boundaries.
 *
 * LIB_WDRESOLVE is the path template, taking the LC_CTYPE locale name,
 * of a shared object that _wckind_init() tries to load; WCHKIND is the
 * name of the classification function looked up in it.
 */
#define	LIB_WDRESOLVE	"/usr/lib/locale/%s/LC_CTYPE/wdresolve.so"
#define	WCHKIND		"_wdchkind_"

static int	_wckind_c_locale(wchar_t);

/* classifier in use; the built-in one until a locale module is loaded */
static int	(*__wckind)(wchar_t) = _wckind_c_locale;
/* handle of the loaded locale module, or NULL if none is loaded */
static void	*dlhandle = NULL;
655
656
657static void
658_wckind_init(void)
659{
660	char	*locale;
661	char	path[MAXPATHLEN + 1];
662
663
664	if (dlhandle != NULL) {
665		(void) dlclose(dlhandle);
666		dlhandle = NULL;
667	}
668
669	locale = setlocale(LC_CTYPE, NULL);
670	if (strcmp(locale, "C") == 0)
671		goto c_locale;
672
673	(void) sprintf(path, LIB_WDRESOLVE, locale);
674
675	if ((dlhandle = dlopen(path, RTLD_LAZY)) != NULL) {
676		__wckind = (int (*)(wchar_t))dlsym(dlhandle, WCHKIND);
677		if (__wckind != NULL)
678			return;
679		(void) dlclose(dlhandle);
680		dlhandle = NULL;
681	}
682
683c_locale:
684	__wckind = _wckind_c_locale;
685}
686
687
688int
689_wckind(wchar_t wc)
690{
691	return (*__wckind) (wc);
692}
693
694
/*
 * Built-in character classifier.
 *
 * ASCII letters, digits and '_' are kind 0; every other ASCII
 * character is kind 1.  A non-ASCII character is classified as its
 * wcsetno() value plus one.
 *
 * DEPEND_ON_ANSIC: the L notation for wide character constants is new
 * in ANSI C; a K&R compiler won't work.
 */
static int
_wckind_c_locale(wchar_t wc)
{
	if (!iswascii(wc))
		return (wcsetno(wc) + 1);

	if (iswalnum(wc) || wc == L'_')
		return (0);

	return (1);
}
711
712/*
713 * header_chk -
714 * Called when done looking for a set mail header lines.
715 * Either a blank line was seen, or EOF was reached.
716 *
717 * Verifies if current hdrbuf of potential mail header lines
718 * is really a mail header.  A mail header must be at least 2
719 * lines and more than half of them must start with one of the
720 * known mail header strings in headnames.
721 *
722 * header_chk sets hdr_state to do_hdr if hdrbuf contained a valid
723 * mail header.  Otherwise, it sets hdr_state to flush_hdr.
724 *
725 * h_lines = hdrbuf index for next line to be saved;
726 *	     also indicates current # of lines in potential header
727 */
728static void
729header_chk(void)
730{
731	wchar_t  *cp; 		/* ptr to current char of line */
732	wchar_t **hp; 		/* ptr to current char of a valid */
733				/* mail header string */
734	int	  l;		/* index */
735				/*
736				 * number of lines in hdrbuf that look
737				 * like mail header lines (start with
738				 * a known mail header prefix)
739				 */
740	int	 hdrcount = 0;
741		/* header must have at least 2 lines (h_lines > 1) */
742		if (h_lines < 2) {
743			hdr_state = flush_hdr;
744			return;
745		}
746		/*
747		 * go through each line in hdrbuf and see how many
748		 * look like mail header lines
749		 */
750		for (l = 0; l < h_lines; l++) {
751			/* skip initial blanks */
752			for (cp = hdrbuf[l]; *cp == L' '; cp++);
753			for (hp = &headnames[0]; *hp != (wchar_t *)0; hp++)
754				if (ispref(*hp, cp)) {
755					hdrcount++;
756					break;
757				}
758		}
759		/*
760		 * if over half match, we'll assume this is a header;
761		 * set hdr_state to indicate whether to treat
762		 * these lines as mail header (do_hdr) or not (flush_hdr)
763		 */
764		if (hdrcount > h_lines / 2)
765			hdr_state = do_hdr;
766		else
767			hdr_state = flush_hdr;
768}
769
770/*
771 * fill_hdrbuf -
772 * Save given input line into next element of hdrbuf,
773 * as a potential mail header line, to be processed later
774 * once we decide whether or not the contents of hdrbuf is
775 * really a mail header, via header_chk().
776 *
777 * Does not allow hdrbuf to exceed MAXLINES lines.
778 * Dynamically allocates space for each line.  If we are unable
779 * to allocate space for the current string, stop special mail
780 * header preservation at this point and continue formatting
781 * without it.
782 */
783static void
784fill_hdrbuf(wchar_t line[])
785{
786	wchar_t *cp;	/* pointer to characters in input line */
787	int	 i;	/* index into characters a hdrbuf line */
788
789	if (h_lines >= MAXLINES) {
790		/*
791		 * if we run over MAXLINES potential mail header
792		 * lines, stop checking--this is most likely NOT a
793		 * mail header; flush out the hdrbuf, then process
794		 * the current 'line' normally.
795		 */
796		hdr_state = flush_hdr;
797		process_hdrbuf();
798		prefix(line);
799		return;
800	}
801	hdrbuf[h_lines] = (wchar_t *)malloc(sizeof (wchar_t) *
802	    (wslen(line) + 1));
803	if (hdrbuf[h_lines] == NULL) {
804		perror("malloc");
805		fprintf(stderr, "fmt: unable to do mail header preservation\n");
806		errs++;
807		/*
808		 * Can't process mail header; flush current contents
809		 * of mail header and continue with no more mail
810		 * header processing
811		 */
812		if (h_lines == 0)
813			/* hdrbuf is empty; process this line normally */
814			prefix(line);
815		else {
816			hdr_state = flush_hdr;
817			for (i = 0; i < h_lines; i++) {
818				prefix(hdrbuf[i]);
819				free(hdrbuf[i]);
820			}
821			h_lines = 0;
822		}
823		hdr_state = off;
824		return;
825	}
826	/* save this line as a potential mail header line */
827	for (i = 0, cp = line; (hdrbuf[h_lines][i] = *cp) != L'\0'; i++, cp++);
828	h_lines++;
829}
830
831/*
832 * process_hdrbuf -
833 * Outputs the lines currently stored in hdrbuf, according
834 * to the current hdr_state value, assumed to be either do_hdr
835 * or flush_hdr.
836 * This should be called after doing a header_chk() to verify
837 * the hdrbuf and set the hdr_state flag.
838 */
839static void
840process_hdrbuf(void)
841{
842int i;
843
844	for (i = 0; i < h_lines; i++) {
845		prefix(hdrbuf[i]);
846		free(hdrbuf[i]);
847	}
848	hdr_state = not_in_hdr;
849	h_lines = 0;
850}
851