fmt.c revision 373:5de22f2b7283
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28/*	  All Rights Reserved	*/
29
30#pragma ident	"%Z%%M%	%I%	%E% SMI"
31
32#include <stdio.h>
33#include <stdlib.h>
34#include <ctype.h>
35#include <wctype.h>
36#include <widec.h>
37#include <dlfcn.h>
38#include <locale.h>
39#include <sys/param.h>
40#include <string.h>
41
42/*
43 * fmt -- format the concatenation of input files or standard input
44 * onto standard output.  Designed for use with Mail ~|
45 *
46 * Syntax: fmt [ -width | -w width ] [ -cs ] [ name ... ]
47 * Author: Kurt Shoens (UCB) 12/7/78
48 */
49
#define	NOSTR	((wchar_t *)0)	/* Null wide-string pointer (typed for lint) */
#define	MAXLINES	100	/* maximum mail header lines to verify */

/*
 * Output line state.  pack() appends words to outbuf through outp,
 * leadin() starts a new line by writing the leading blanks, and
 * oflush() hands the finished line to tabulate().
 */
wchar_t	outbuf[BUFSIZ];			/* Sandbagged output line image */
wchar_t	*outp;				/* Next free slot in outbuf; NOSTR if no line is pending */
int	filler;				/* Leading blank count leadin() put in outbuf */

int	pfx;			/* Current leading blank count */
int	width = 72;		/* Width that we will not exceed */
int	nojoin = 0;		/* split lines only, don't join short ones */
int	errs = 0;		/* Current number of errors */

/*
 * Crown margin (-c) state machine:
 *	c_none	crown mode not requested (the zero-initialized default)
 *	c_reset	set by -c and by every blank line; the next text line
 *		supplies crown_head
 *	c_head	crown_head recorded; no output line started yet
 *	c_lead	first output line of the paragraph has been started
 *	c_fixup	a further output line was started before the body indent
 *		was known; prefix() re-splits the pending text once the
 *		next input line supplies crown_body
 *	c_body	crown_body known; leadin() indents with it
 */
enum crown_type	{c_none, c_reset, c_head, c_lead, c_fixup, c_body};
enum crown_type	crown_state;	/* Crown margin state */
int	crown_head;		/* Indent of a paragraph's first line */
int	crown_body;		/* Indent of a paragraph's remaining lines */
	/*
	 * currently-known initial strings found in mail headers;
	 * the list is terminated by a null pointer
	 */
wchar_t	*headnames[] = {
	L"Apparently-To", L"Bcc", L"bcc", L"Cc", L"cc", L"Confirmed-By",
	L"Content", L"content-length", L"From", L"Date", L"id",
	L"Message-I", L"MIME-Version", L"Precedence", L"Return-Path",
	L"Received", L"Reply-To", L"Status", L"Subject", L"To", L"X-IMAP",
	L"X-Lines", L"X-Sender", L"X-Sun", L"X-Status", L"X-UID",
	0};

/*
 * Mail header detection state.  flush_hdr and do_hdr are set by
 * header_chk() (flush_hdr also by fill_hdrbuf()) and are cleared
 * again by process_hdrbuf(), which returns to not_in_hdr.
 */
enum hdr_type {
	off,		/* mail header processing is off */
	not_in_hdr,	/* not currently processing a mail header */
	in_hdr, 	/* currently filling hdrbuf with potential hdr lines */
	flush_hdr,	/* flush hdrbuf; not a header, no special processing */
	do_hdr		/* process hdrbuf as a mail header */
};
				/* current state of hdrbuf */
enum hdr_type	hdr_state = not_in_hdr;

wchar_t *hdrbuf[MAXLINES];	/* malloc'ed copies of potential mail header lines */
int 	h_lines;		/* number of lines currently held in hdrbuf */

/* Line splitter in use; main() points it at csplit() or msplit() */
void (*(split))(wchar_t []);
/* Defined outside this file; fmt() adds its result to the column count */
extern int scrwidth(wchar_t);
/* Defined outside this file; nonzero result makes fmt() start collecting a header */
extern int ishead(char []);


static void fill_hdrbuf(wchar_t []);
static void header_chk(void);
static void process_hdrbuf(void);
static void leadin(void);
static void tabulate(wchar_t []);
static void oflush(void);
static void pack(wchar_t []);
static void msplit(wchar_t []);
static void csplit(wchar_t []);
static void _wckind_init(void);
static void prefix(wchar_t []);
static void fmt(FILE *);
static int setopt(char *);
int _wckind(wchar_t);
107
108/*
109 * Drive the whole formatter by managing input files.  Also,
110 * cause initialization of the output stuff and flush it out
111 * at the end.
112 */
113
114int
115main(int argc, char **argv)
116{
117	FILE *fi;
118	char sobuf[BUFSIZ];
119	char *cp;
120	int nofile;
121	char *locale;
122
123	outp = NOSTR;
124	setbuf(stdout, sobuf);
125	setlocale(LC_ALL, "");
126	locale = setlocale(LC_CTYPE, "");
127	if (strcmp(locale, "C") == 0) {
128		split = csplit;
129	} else {
130		split = msplit;
131		_wckind_init();
132	}
133	if (argc < 2) {
134single:
135		fmt(stdin);
136		oflush();
137		exit(0);
138	}
139	nofile = 1;
140	while (--argc) {
141		cp = *++argv;
142		if (setopt(cp))
143			continue;
144		nofile = 0;
145		if ((fi = fopen(cp, "r")) == NULL) {
146			perror(cp);
147			errs++;
148			continue;
149		}
150		fmt(fi);
151		fclose(fi);
152	}
153	if (nofile)
154		goto single;
155	oflush();
156	return (errs);
157}
158
159/*
160 * Read up characters from the passed input file, forming lines,
161 * doing ^H processing, expanding tabs, stripping trailing blanks,
162 * and sending each line down for analysis.
163 */
164
165static void
166fmt(FILE *fi)
167{
168	wchar_t linebuf[BUFSIZ], canonb[BUFSIZ];
169	wchar_t *cp, *cp2;
170	int col;
171	wchar_t	c;
172	char	cbuf[BUFSIZ];	/* stores wchar_t string as char string */
173
174	c = getwc(fi);
175	while (c != EOF) {
176		/*
177		 * Collect a line, doing ^H processing.
178		 * Leave tabs for now.
179		 */
180
181		cp = linebuf;
182		while (c != L'\n' && c != EOF && cp-linebuf < BUFSIZ-1) {
183			if (c == L'\b') {
184				if (cp > linebuf)
185					cp--;
186				c = getwc(fi);
187				continue;
188			}
189			if (!(iswprint(c)) && c != L'\t') {
190				c = getwc(fi);
191				continue;
192			}
193			*cp++ = c;
194			c = getwc(fi);
195		}
196		*cp = L'\0';
197
198		/*
199		 * Toss anything remaining on the input line.
200		 */
201
202		while (c != L'\n' && c != EOF)
203			c = getwc(fi);
204		/*
205		 * Expand tabs on the way to canonb.
206		 */
207
208		col = 0;
209		cp = linebuf;
210		cp2 = canonb;
211		while (c = *cp++) {
212			if (c != L'\t') {
213				col += scrwidth(c);
214				if (cp2-canonb < BUFSIZ-1)
215					*cp2++ = c;
216				continue;
217			}
218			do {
219				if (cp2-canonb < BUFSIZ-1)
220					*cp2++ = L' ';
221				col++;
222			} while ((col & 07) != 0);
223		}
224
225		/*
226		 * Swipe trailing blanks from the line.
227		 */
228
229		for (cp2--; cp2 >= canonb && *cp2 == L' '; cp2--);
230		*++cp2 = '\0';
231
232			/* special processing to look for mail header lines */
233		switch (hdr_state) {
234		case off:
235			prefix(canonb);
236		case not_in_hdr:
237			/* look for an initial mail header line */
238			/* skip initial blanks */
239			for (cp = canonb; *cp == L' '; cp++);
240			/*
241			 * Need to convert string from wchar_t to char,
242			 * since this is what ishead() expects.  Since we
243			 * only want to make sure cp points to a "From" line
244			 * of the email, we don't have to alloc
245			 * BUFSIZ * MB_LEN_MAX to cbuf.
246			 */
247			wcstombs(cbuf, cp, (BUFSIZ - 1));
248			if (ishead(cbuf)) {
249				hdr_state = in_hdr;
250				fill_hdrbuf(canonb);
251			} else {
252				/* no mail header line; process normally */
253				prefix(canonb);
254			}
255			break;
256		case in_hdr:
257			/* already saw 1st mail header line; look for more */
258			if (canonb[0] == L'\0') {
259				/*
260				 * blank line means end of mail header;
261				 * verify current mail header buffer
262				 * then process it accordingly
263				 */
264				header_chk();
265				process_hdrbuf();
266				/* now process the current blank line */
267				prefix(canonb);
268			} else
269				/*
270				 * not a blank line--save this line as
271				 * a potential mail header line
272				 */
273				fill_hdrbuf(canonb);
274			break;
275		}
276		if (c != EOF)
277			c = getwc(fi);
278	}
279	/*
280	 * end of this file--make sure we process the stuff in
281	 * hdrbuf before we're finished
282	 */
283	if (hdr_state == in_hdr) {
284		header_chk();
285		process_hdrbuf();
286	}
287}
288
289/*
290 * Take a line devoid of tabs and other garbage and determine its
291 * blank prefix.  If the indent changes, call for a linebreak.
292 * If the input line is blank, echo the blank line on the output.
293 * Finally, if the line minus the prefix is a mail header, try to keep
294 * it on a line by itself.
295 */
296
297static void
298prefix(wchar_t line[])
299{
300	wchar_t *cp;
301	int np;
302	int nosplit = 0;	/* flag set if line should not be split */
303
304	if (line[0] == L'\0') {
305		oflush();
306		putchar('\n');
307		if (crown_state != c_none)
308			crown_state = c_reset;
309		return;
310	}
311	for (cp = line; *cp == L' '; cp++);
312	np = cp - line;
313
314	/*
315	 * The following horrible expression attempts to avoid linebreaks
316	 * when the indent changes due to a paragraph.
317	 */
318
319	if (crown_state == c_none && np != pfx && (np > pfx || abs(pfx-np) > 8))
320		oflush();
321	/*
322	 * if this is a mail header line, don't split it; flush previous
323	 * line, if any, so we don't join this line to it
324	 */
325	if (hdr_state == do_hdr) {
326		nosplit = 1;
327		oflush();
328	}
329	/* flush previous line so we don't join this one to it */
330	if (nojoin)
331		oflush();
332	/* nroff-type lines starting with '.' are not split nor joined */
333	if (!nosplit && (nosplit = (*cp == L'.')))
334		oflush();
335	pfx = np;
336	switch (crown_state) {
337	case c_reset:
338		crown_head = pfx;
339		crown_state = c_head;
340		break;
341	case c_lead:
342		crown_body = pfx;
343		crown_state = c_body;
344		break;
345	case c_fixup:
346		crown_body = pfx;
347		crown_state = c_body;
348		if (outp) {
349			wchar_t s[BUFSIZ];
350
351			*outp = L'\0';
352			wscpy(s, &outbuf[crown_head]);
353			outp = NOSTR;
354			split(s);
355		}
356		break;
357	}
358	if (nosplit) {
359		/* put whole input line onto outbuf and print it out */
360		pack(cp);
361		oflush();
362	} else
363		/*
364		 * split puts current line onto outbuf, but splits it
365		 * at word boundaries, if it exceeds desired length
366		 */
367		split(cp);
368	if (nojoin)
369		/*
370		 * flush current line so next lines, if any,
371		 * won't join to this one
372		 */
373		oflush();
374}
375
/*
 * Split up the passed line into output "words" which are
 * maximal strings of non-blanks with the blank separation
 * attached at the end.  Pass these words along to the output
 * line packer.
 *
 * The last word of the line gets one trailing blank, or two when it
 * ends in sentence punctuation (one of ".:!?").  The word buffer has
 * room for those blanks, and copying into it is bounded: a run that
 * does not fit is handed to pack() in several pieces instead of
 * overrunning the buffer.
 */

static void
csplit(wchar_t line[])
{
	static const wchar_t srchlist[] = L".:!?";
	wchar_t word[BUFSIZ + 2];	/* + two trailing blanks */
	wchar_t *limit = word + BUFSIZ - 1;	/* end of the copy area */
	wchar_t *cp, *cp2;
	int need;

	cp = line;
	while (*cp != L'\0') {
		cp2 = word;

		/*
		 * Collect a 'word,' allowing it to contain escaped
		 * white space.  A backslash and the white space after
		 * it are copied as a pair, so both must fit.
		 */

		while (*cp != L'\0' && !iswspace(*cp)) {
			need = (*cp == L'\\' && iswspace(cp[1])) ? 2 : 1;
			if (limit - cp2 < need)
				break;
			while (need-- > 0)
				*cp2++ = *cp++;
		}

		/*
		 * Guarantee a space at end of line.
		 * Two spaces after end of sentence punctuation.
		 * (cp[-1] is valid: at least one character of this
		 * line has been consumed when *cp is the terminator.)
		 */

		if (*cp == L'\0') {
			*cp2++ = L' ';
			if (wschr(srchlist, cp[-1]) != NULL)
				*cp2++ = L' ';
		}
		while (iswspace(*cp) && cp2 < limit)
			*cp2++ = *cp++;
		*cp2 = L'\0';
		pack(word);
	}
}
421
/*
 * Multibyte-locale variant of the line splitter.  Works like the
 * single-byte splitter, but also ends a word where the character
 * kind reported by _wckind() changes and at least one of the two
 * neighbouring characters has a non-zero wcsetno() value.
 *
 * The last word of the line gets one trailing blank, or two when it
 * ends in one of ".:!?".  The word buffer has room for those blanks,
 * and copying into it is bounded: a run that does not fit is handed
 * to pack() in several pieces instead of overrunning the buffer.
 */
static void
msplit(wchar_t line[])
{
	static const wchar_t srchlist[] = L".:!?";
	wchar_t word[BUFSIZ + 2];	/* + two trailing blanks */
	wchar_t *limit = word + BUFSIZ - 1;	/* end of the copy area */
	wchar_t *cp, *cp2, prev;
	int need;

	cp = line;
	while (*cp != L'\0') {
		cp2 = word;
		prev = *cp;

		/*
		 * Collect a 'word,' allowing it to contain escaped
		 * white space.  prev starts out equal to the first
		 * character, so every word takes at least one character.
		 */

		while (*cp != L'\0' && !iswspace(*cp)) {
			if (_wckind(*cp) != _wckind(prev) &&
			    (wcsetno(*cp) != 0 || wcsetno(prev) != 0))
				break;
			need = (*cp == L'\\' && iswspace(cp[1])) ? 2 : 1;
			if (limit - cp2 < need)
				break;
			if (need == 2)
				*cp2++ = *cp++;		/* the backslash */
			prev = *cp;
			*cp2++ = *cp++;
		}

		/*
		 * Guarantee a space at end of line.
		 * Two spaces after end of sentence punctuation.
		 */

		if (*cp == L'\0') {
			*cp2++ = L' ';
			if (wschr(srchlist, cp[-1]) != NULL)
				*cp2++ = L' ';
		}
		while (iswspace(*cp) && cp2 < limit)
			*cp2++ = *cp++;
		*cp2 = L'\0';
		pack(word);
	}
}
467
468/*
469 * Output section.
470 * Build up line images from the words passed in.  Prefix
471 * each line with correct number of blanks.  The buffer "outbuf"
472 * contains the current partial line image, including prefixed blanks.
473 * "outp" points to the next available space therein.  When outp is NOSTR,
474 * there ain't nothing in there yet.  At the bottom of this whole mess,
475 * leading tabs are reinserted.
476 */
477
478/*
479 * Pack a word onto the output line.  If this is the beginning of
480 * the line, push on the appropriately-sized string of blanks first.
481 * If the word won't fit on the current line, flush and begin a new
482 * line.  If the word is too long to fit all by itself on a line,
483 * just give it its own and hope for the best.
484 */
485
486static void
487pack(wchar_t word[])
488{
489	wchar_t *cp;
490	int s, t;
491
492	if (outp == NOSTR)
493		leadin();
494	t = wscol(word);
495	*outp = L'\0';
496	s = wscol(outbuf);
497	if (t+s <= width) {
498		for (cp = word; *cp; *outp++ = *cp++);
499		return;
500	}
501	if (s > filler) {
502		oflush();
503		leadin();
504	}
505	for (cp = word; *cp; *outp++ = *cp++);
506}
507
508/*
509 * If there is anything on the current output line, send it on
510 * its way.  Set outp to NOSTR to indicate the absence of the current
511 * line prefix.
512 */
513
514static void
515oflush(void)
516{
517	if (outp == NOSTR)
518		return;
519	*outp = L'\0';
520	tabulate(outbuf);
521	outp = NOSTR;
522}
523
/*
 * Write the passed line to standard output, followed by a newline.
 * Trailing blanks are removed (the line is modified in place) and
 * the leading blanks are written as one tab per 8 blanks plus the
 * remaining blanks.
 */

static void
tabulate(wchar_t line[])
{
	wchar_t *end;		/* one past the last kept character */
	wchar_t *text;		/* first non-blank character */
	int blanks;		/* number of leading blanks */
	int n;

	/* Toss trailing blanks in the output line */
	end = line + wslen(line);
	while (end > line && end[-1] == L' ')
		end--;
	*end = L'\0';

	/* Count the leading blank space and tabulate */
	text = line;
	while (*text == L' ')
		text++;
	blanks = text - line;

	for (n = blanks >> 3; n > 0; n--)
		putc('\t', stdout);
	for (n = blanks & 07; n > 0; n--)
		putc(' ', stdout);
	for (; *text != L'\0'; text++)
		putwc(*text, stdout);
	putc('\n', stdout);
}
558
559/*
560 * Initialize the output line with the appropriate number of
561 * leading blanks.
562 */
563
564static void
565leadin(void)
566{
567	int b;
568	wchar_t *cp;
569	int l;
570
571	switch (crown_state) {
572	case c_head:
573		l = crown_head;
574		crown_state = c_lead;
575		break;
576
577	case c_lead:
578	case c_fixup:
579		l = crown_head;
580		crown_state = c_fixup;
581		break;
582
583	case c_body:
584		l = crown_body;
585		break;
586
587	default:
588		l = pfx;
589		break;
590	}
591	filler = l;
592	for (b = 0, cp = outbuf; b < l; b++)
593		*cp++ = L' ';
594	outp = cp;
595}
596
/*
 * Is s1 a prefix of s2??
 *
 * Returns 1 when every character of s1 matches the start of s2
 * (an empty s1 is a prefix of anything), 0 otherwise.  In
 * particular the answer is 0 when s2 ends before s1 does.
 */

static int
ispref(wchar_t *s1, wchar_t *s2)
{

	while (*s1 != L'\0') {
		/* the terminator of a shorter s2 counts as a mismatch */
		if (*s1++ != *s2++)
			return (0);
	}
	return (1);
}
610
611/*
612 * Set an input option
613 */
614
615static int
616setopt(char *cp)
617{
618	static int ws = 0;
619
620	if (*cp == '-') {
621		if (cp[1] == 'c' && cp[2] == '\0') {
622			crown_state = c_reset;
623			return (1);
624		}
625		if (cp[1] == 's' && cp[2] == '\0') {
626			nojoin = 1;
627			return (1);
628		}
629		if (cp[1] == 'w' && cp[2] == '\0') {
630			ws++;
631			return (1);
632		}
633		width = atoi(cp+1);
634	} else if (ws) {
635		width = atoi(cp);
636		ws = 0;
637	} else
638		return (0);
639	if (width <= 0 || width >= BUFSIZ-2) {
640		fprintf(stderr, "fmt:  bad width: %d\n", width);
641		exit(1);
642	}
643	return (1);
644}
645
646
/* Per-locale word-resolution module; %s is the LC_CTYPE locale name */
#define	LIB_WDRESOLVE	"/usr/lib/locale/%s/LC_CTYPE/wdresolve.so"
/* Name of the character-kind function looked up in that module */
#define	WCHKIND		"_wdchkind_"

static int	_wckind_c_locale(wchar_t);

/* Function _wckind() forwards to; the built-in version by default */
static int	(*__wckind)(wchar_t) = _wckind_c_locale;
/* Handle of the currently loaded module, or NULL */
static void	*dlhandle = NULL;


/*
 * Select the character-kind function used by _wckind().
 *
 * Any module loaded by an earlier call is closed first.  Outside the
 * "C" locale the module named by LIB_WDRESOLVE is opened and its
 * WCHKIND symbol is used.  The built-in _wckind_c_locale() stays in
 * effect when the locale is "C" or unknown, when the module path
 * does not fit in the path buffer, or when the module or the symbol
 * cannot be found.
 */
static void
_wckind_init(void)
{
	char	*locale;
	char	path[MAXPATHLEN + 1];
	int	len;

	if (dlhandle != NULL) {
		(void) dlclose(dlhandle);
		dlhandle = NULL;
	}
	__wckind = _wckind_c_locale;

	locale = setlocale(LC_CTYPE, NULL);
	if (locale == NULL || strcmp(locale, "C") == 0)
		return;

	/* bounded formatting: a long locale name must not overrun path */
	len = snprintf(path, sizeof (path), LIB_WDRESOLVE, locale);
	if (len < 0 || (size_t)len >= sizeof (path))
		return;

	if ((dlhandle = dlopen(path, RTLD_LAZY)) == NULL)
		return;

	__wckind = (int (*)(wchar_t))dlsym(dlhandle, WCHKIND);
	if (__wckind == NULL) {
		/* module lacks the symbol; drop it and use the built-in */
		(void) dlclose(dlhandle);
		dlhandle = NULL;
		__wckind = _wckind_c_locale;
	}
}
685
686
/*
 * Return the kind of the wide character wc, as reported by the
 * function currently installed in __wckind.
 */
int
_wckind(wchar_t wc)
{
	int kind;

	kind = __wckind(wc);
	return (kind);
}
692
693
/*
 * Built-in character-kind function.
 *
 * For ASCII characters: 0 for alphanumerics and '_', 1 for anything
 * else.  For all other characters: wcsetno(wc) + 1.
 *
 * DEPEND_ON_ANSIC: L notion for the character is new in
 * ANSI-C, k&r compiler won't work.
 */
static int
_wckind_c_locale(wchar_t wc)
{
	if (!iswascii(wc))
		return (wcsetno(wc) + 1);
	if (iswalnum(wc) || wc == L'_')
		return (0);
	return (1);
}
710
711/*
712 * header_chk -
713 * Called when done looking for a set mail header lines.
714 * Either a blank line was seen, or EOF was reached.
715 *
716 * Verifies if current hdrbuf of potential mail header lines
717 * is really a mail header.  A mail header must be at least 2
718 * lines and more than half of them must start with one of the
719 * known mail header strings in headnames.
720 *
721 * header_chk sets hdr_state to do_hdr if hdrbuf contained a valid
722 * mail header.  Otherwise, it sets hdr_state to flush_hdr.
723 *
724 * h_lines = hdrbuf index for next line to be saved;
725 *	     also indicates current # of lines in potential header
726 */
727static void
728header_chk(void)
729{
730	wchar_t  *cp; 		/* ptr to current char of line */
731	wchar_t **hp; 		/* ptr to current char of a valid */
732				/* mail header string */
733	int	  l;		/* index */
734				/*
735				 * number of lines in hdrbuf that look
736				 * like mail header lines (start with
737				 * a known mail header prefix)
738				 */
739	int	 hdrcount = 0;
740		/* header must have at least 2 lines (h_lines > 1) */
741		if (h_lines < 2) {
742			hdr_state = flush_hdr;
743			return;
744		}
745		/*
746		 * go through each line in hdrbuf and see how many
747		 * look like mail header lines
748		 */
749		for (l = 0; l < h_lines; l++) {
750			/* skip initial blanks */
751			for (cp = hdrbuf[l]; *cp == L' '; cp++);
752			for (hp = &headnames[0]; *hp != (wchar_t *)0; hp++)
753				if (ispref(*hp, cp)) {
754					hdrcount++;
755					break;
756				}
757		}
758		/*
759		 * if over half match, we'll assume this is a header;
760		 * set hdr_state to indicate whether to treat
761		 * these lines as mail header (do_hdr) or not (flush_hdr)
762		 */
763		if (hdrcount > h_lines / 2)
764			hdr_state = do_hdr;
765		else
766			hdr_state = flush_hdr;
767}
768
769/*
770 * fill_hdrbuf -
771 * Save given input line into next element of hdrbuf,
772 * as a potential mail header line, to be processed later
773 * once we decide whether or not the contents of hdrbuf is
774 * really a mail header, via header_chk().
775 *
776 * Does not allow hdrbuf to exceed MAXLINES lines.
777 * Dynamically allocates space for each line.  If we are unable
778 * to allocate space for the current string, stop special mail
779 * header preservation at this point and continue formatting
780 * without it.
781 */
782static void
783fill_hdrbuf(wchar_t line[])
784{
785	wchar_t *cp;	/* pointer to characters in input line */
786	int	 i;	/* index into characters a hdrbuf line */
787
788	if (h_lines >= MAXLINES) {
789		/*
790		 * if we run over MAXLINES potential mail header
791		 * lines, stop checking--this is most likely NOT a
792		 * mail header; flush out the hdrbuf, then process
793		 * the current 'line' normally.
794		 */
795		hdr_state = flush_hdr;
796		process_hdrbuf();
797		prefix(line);
798		return;
799	}
800	hdrbuf[h_lines] = (wchar_t *)malloc(sizeof (wchar_t) *
801	    (wslen(line) + 1));
802	if (hdrbuf[h_lines] == NULL) {
803		perror("malloc");
804		fprintf(stderr, "fmt: unable to do mail header preservation\n");
805		errs++;
806		/*
807		 * Can't process mail header; flush current contents
808		 * of mail header and continue with no more mail
809		 * header processing
810		 */
811		if (h_lines == 0)
812			/* hdrbuf is empty; process this line normally */
813			prefix(line);
814		else {
815			hdr_state = flush_hdr;
816			for (i = 0; i < h_lines; i++) {
817				prefix(hdrbuf[i]);
818				free(hdrbuf[i]);
819			}
820			h_lines = 0;
821		}
822		hdr_state = off;
823		return;
824	}
825	/* save this line as a potential mail header line */
826	for (i = 0, cp = line; (hdrbuf[h_lines][i] = *cp) != L'\0'; i++, cp++);
827	h_lines++;
828}
829
830/*
831 * process_hdrbuf -
832 * Outputs the lines currently stored in hdrbuf, according
833 * to the current hdr_state value, assumed to be either do_hdr
834 * or flush_hdr.
835 * This should be called after doing a header_chk() to verify
836 * the hdrbuf and set the hdr_state flag.
837 */
838static void
839process_hdrbuf(void)
840{
841int i;
842
843	for (i = 0; i < h_lines; i++) {
844		prefix(hdrbuf[i]);
845		free(hdrbuf[i]);
846	}
847	hdr_state = not_in_hdr;
848	h_lines = 0;
849}
850