1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1992-2012 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                 Eclipse Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*          http://www.eclipse.org/org/documents/epl-v10.html           *
11*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                                                                      *
20***********************************************************************/
21#pragma prototyped
22
23static const char usage[] =
24"[-?\n@(#)$Id: fmt (AT&T Research) 2007-01-02 $\n]"
25USAGE_LICENSE
26"[+NAME?fmt - simple text formatter]"
27"[+DESCRIPTION?\bfmt\b reads the input files and left justifies space "
28    "separated words into lines \awidth\a characters or less in length and "
29    "writes the lines to the standard output. The standard input is read if "
30    "\b-\b or no files are specified. Blank lines and interword spacing are "
31    "preserved in the output. Indentation is preserved, and lines with "
32    "identical indentation are joined and justified.]"
33"[+?\bfmt\b is meant to format mail messages prior to sending, but may "
34    "also be useful for other simple tasks. For example, in \bvi\b(1) the "
35    "command \b:!}fmt\b will justify the lines in the current paragraph.]"
36"[c:crown-margin?Preserve the indentation of the first two lines within "
37    "a paragraph, and align the left margin of each subsequent line with "
38    "that of the second line.]"
39"[o:optget?Format concatenated \boptget\b(3) usage strings.]"
40"[s:split-only?Split lines only; do not join short lines to form longer "
41    "ones.]"
42"[u:uniform-spacing?One space between words, two after sentences.]"
43"[w:width?Set the output line width to \acolumns\a.]#[columns:=72]"
44    "\n\n"
45"[ file ... ]"
46    "\n\n"
47"[+SEE ALSO?\bmailx\b(1), \bnroff\b(1), \btroff\b(1), \bvi\b(1), "
48    "\boptget\b(3)]"
49;
50
51#include <cmd.h>
52#include <ctype.h>
53
54typedef struct Fmt_s
55{
56	long	flags;
57	char*	outp;
58	char*	outbuf;
59	char*	endbuf;
60	Sfio_t*	in;
61	Sfio_t*	out;
62	int	indent;
63	int	nextdent;
64	int	nwords;
65	int	prefix;
66	int	quote;
67	int	retain;
68	int	section;
69} Fmt_t;
70
#define INDENT		4	/* -o mode indentation step		*/
#define TABSZ		8	/* columns per tab stop			*/

/*
 * option flags are kept in a long with one bit per lower case option
 * letter: bit 0 is 'a', bit 1 is 'b' and so on
 */

#define FMT_OPTBIT(c)	(1L<<((c)-'a'))

#define isoption(fp,c)	((fp)->flags&FMT_OPTBIT(c))
#define setoption(fp,c)	((fp)->flags|=FMT_OPTBIT(c))
#define clroption(fp,c)	((fp)->flags&=~FMT_OPTBIT(c))
77
78static void
79outline(Fmt_t* fp)
80{
81	register char*	cp = fp->outbuf;
82	int		n = 0;
83	int		c;
84	int		d;
85
86	if (!fp->outp)
87		return;
88	while (fp->outp[-1] == ' ')
89		fp->outp--;
90	*fp->outp = 0;
91	while (*cp++ == ' ')
92		n++;
93	if (n >= TABSZ)
94	{
95		n /= TABSZ;
96		cp = &fp->outbuf[TABSZ*n];
97		while (n--)
98			*--cp = '\t';
99	}
100	else
101		cp = fp->outbuf;
102	fp->nwords = 0;
103	if (!isoption(fp, 'o'))
104		sfputr(fp->out, cp, '\n');
105	else if (*cp)
106	{
107		n = fp->indent;
108		if (*cp != '[')
109		{
110			if (*cp == ' ')
111				cp++;
112			n += INDENT;
113		}
114		while (n--)
115			sfputc(fp->out, ' ');
116		if (fp->quote)
117		{
118			if ((d = (fp->outp - cp)) <= 0)
119				c = 0;
120			else if ((c = fp->outp[-1]) == 'n' && d > 1 && fp->outp[-2] == '\\')
121				c = '}';
122			sfprintf(fp->out, "\"%s%s\"\n", cp, c == ']' || c == '{' || c == '}' ? "" : " ");
123		}
124		else
125			sfputr(fp->out, cp, '\n');
126		if (fp->nextdent)
127		{
128			fp->indent += fp->nextdent;
129			fp->endbuf -= fp->nextdent;
130			fp->nextdent = 0;
131		}
132	}
133	fp->outp = 0;
134}
135
136static void
137split(Fmt_t* fp, char* buf, int splice)
138{
139	register char*	cp;
140	register char*	ep;
141	register char*	qp;
142	register int	c = 1;
143	register int	q = 0;
144	register int	n;
145	int		prefix;
146
147	for (ep = buf; *ep == ' '; ep++);
148	prefix = ep - buf;
149
150	/*
151	 * preserve blank lines
152	 */
153
154	if ((*ep == 0 || *buf == '.') && !isoption(fp, 'o'))
155	{
156		if (*ep)
157			prefix = strlen(buf);
158		outline(fp);
159		strcpy(fp->outbuf, buf);
160		fp->outp = fp->outbuf+prefix;
161		outline(fp);
162		return;
163	}
164	if (fp->prefix < prefix && !isoption(fp, 'c'))
165		outline(fp);
166	if (!fp->outp || prefix < fp->prefix)
167		fp->prefix = prefix;
168	while (c)
169	{
170		cp = ep;
171		while (*ep == ' ')
172			ep++;
173		if (cp != ep && isoption(fp, 'u'))
174			cp = ep-1;
175		while (c = *ep)
176		{
177			if (c == ' ')
178				break;
179			ep++;
180
181			/*
182			 * skip over \space
183			 */
184
185			if (c == '\\' && *ep)
186				ep++;
187		}
188		n = (ep-cp);
189		if (n && isoption(fp, 'o'))
190		{
191			for (qp = cp; qp < ep; qp++)
192				if (*qp == '\\')
193					qp++;
194				else if (*qp == '"')
195					q = !q;
196			if (*(ep-1) == '"')
197				goto skip;
198		}
199		if (fp->nwords > 0 && &fp->outp[n] >= fp->endbuf && !fp->retain && !q)
200			outline(fp);
201	skip:
202		if (fp->nwords == 0)
203		{
204			if (fp->prefix)
205				memset(fp->outbuf, ' ', fp->prefix);
206			fp->outp = &fp->outbuf[fp->prefix];
207			while (*cp == ' ')
208				cp++;
209			n = (ep-cp);
210		}
211		memcpy(fp->outp, cp, n);
212		fp->outp += n;
213		fp->nwords++;
214	}
215	if (isoption(fp, 's') || *buf == 0)
216		outline(fp);
217	else if (fp->outp)
218	{
219		/*
220		 * two spaces at ends of sentences
221		 */
222
223		if (!isoption(fp, 'o') && strchr(".:!?", fp->outp[-1]))
224			*fp->outp++ = ' ';
225		if (!splice && !fp->retain && (!fp->quote || (fp->outp - fp->outbuf) < 2 || fp->outp[-2] != '\\' || fp->outp[-1] != 'n' && fp->outp[-1] != 't' && fp->outp[-1] != ' '))
226			*fp->outp++ = ' ';
227	}
228}
229
230static int
231dofmt(Fmt_t* fp)
232{
233	register int	c;
234	int		b;
235	int		x;
236	int		splice;
237	char*		cp;
238	char*		dp;
239	char*		ep;
240	char*		lp;
241	char*		tp;
242	char		buf[8192];
243
244	cp = 0;
245	while (cp || (cp = sfgetr(fp->in, '\n', 0)) && !(splice = 0) && (lp = cp + sfvalue(fp->in) - 1) || (cp = sfgetr(fp->in, '\n', SF_LASTR)) && (splice = 1) && (lp = cp + sfvalue(fp->in)))
246	{
247		if (isoption(fp, 'o'))
248		{
249			if (!isoption(fp, 'i'))
250			{
251				setoption(fp, 'i');
252				b = 0;
253				while (cp < lp)
254				{
255					if (*cp == ' ')
256						b += 1;
257					else if (*cp == '\t')
258						b += INDENT;
259					else
260						break;
261					cp++;
262				}
263				fp->indent = roundof(b, INDENT);
264			}
265			else
266				while (cp < lp && (*cp == ' ' || *cp == '\t'))
267					cp++;
268			if (!isoption(fp, 'q') && cp < lp)
269			{
270				setoption(fp, 'q');
271				if (*cp == '"')
272				{
273					ep = lp;
274					while (--ep > cp)
275						if (*ep == '"')
276						{
277							fp->quote = 1;
278							break;
279						}
280						else if (*ep != ' ' && *ep != '\t')
281							break;
282				}
283			}
284		}
285	again:
286		dp = buf;
287		ep = 0;
288		for (b = 1;; b = 0)
289		{
290			if (cp >= lp)
291			{
292				cp = 0;
293				break;
294			}
295			c = *cp++;
296			if (isoption(fp, 'o'))
297			{
298				if (c == '\\')
299				{
300					x = 0;
301					c = ' ';
302					cp--;
303					while (cp < lp)
304					{
305						if (*cp == '\\')
306						{
307							cp++;
308							if ((lp - cp) < 1)
309							{
310								c = '\\';
311								break;
312							}
313							if (*cp == 'n')
314							{
315								cp++;
316								c = '\n';
317								if ((lp - cp) > 2)
318								{
319									if (*cp == ']' || *cp == '@' && *(cp + 1) == '(')
320									{
321										*dp++ = '\\';
322										*dp++ = 'n';
323										c = *cp++;
324										break;
325									}
326									if (*cp == '\\' && *(cp + 1) == 'n')
327									{
328										cp += 2;
329										*dp++ = '\n';
330										break;
331									}
332								}
333							}
334							else if (*cp == 't' || *cp == ' ')
335							{
336								cp++;
337								x = 1;
338								c = ' ';
339							}
340							else
341							{
342								if (x && dp != buf && *(dp - 1) != ' ')
343									*dp++ = ' ';
344								*dp++ = '\\';
345								c = *cp++;
346								break;
347							}
348						}
349						else if (*cp == ' ' || *cp == '\t')
350						{
351							cp++;
352							c = ' ';
353							x = 1;
354						}
355						else
356						{
357							if (x && c != '\n' && dp != buf && *(dp - 1) != ' ')
358								*dp++ = ' ';
359							break;
360						}
361					}
362					if (c == '\n')
363					{
364						c = 0;
365						goto flush;
366					}
367					if (c == ' ' && (dp == buf || *(dp - 1) == ' '))
368						continue;
369				}
370				else if (c == '"')
371				{
372					if (b || cp >= lp)
373					{
374						if (fp->quote)
375							continue;
376						fp->section = 0;
377					}
378				}
379				else if (c == '\a')
380				{
381					*dp++ = '\\';
382					c = 'a';
383				}
384				else if (c == '\b')
385				{
386					*dp++ = '\\';
387					c = 'b';
388				}
389				else if (c == '\f')
390				{
391					*dp++ = '\\';
392					c = 'f';
393				}
394				else if (c == '\v')
395				{
396					*dp++ = '\\';
397					c = 'v';
398				}
399				else if (c == ']' && (cp >= lp || *cp != ':' && *cp != '#' && *cp != '!'))
400				{
401					if (cp < lp && *cp == ']')
402					{
403						cp++;
404						*dp++ = c;
405					}
406					else
407					{
408						fp->section = 1;
409						fp->retain = 0;
410					flush:
411						*dp++ = c;
412						*dp = 0;
413						split(fp, buf, 0);
414						outline(fp);
415						goto again;
416					}
417				}
418				else if (fp->section)
419				{
420					if (c == '[')
421					{
422						if (b)
423							fp->retain = 1;
424						else
425						{
426							cp--;
427							c = 0;
428							goto flush;
429						}
430						fp->section = 0;
431					}
432					else if (c == '{')
433					{
434						x = 1;
435						for (tp = cp; tp < lp; tp++)
436						{
437							if (*tp == '[' || *tp == '\n')
438								break;
439							if (*tp == ' ' || *tp == '\t' || *tp == '"')
440								continue;
441							if (*tp == '\\' && (lp - tp) > 1)
442							{
443								if (*++tp == 'n')
444									break;
445								if (*tp == 't' || *tp == '\n')
446									continue;
447							}
448							x = 0;
449							break;
450						}
451						if (x)
452						{
453							if (fp->endbuf > (fp->outbuf + fp->indent + 2*INDENT))
454								fp->nextdent = 2*INDENT;
455							goto flush;
456						}
457						else
458							fp->section = 0;
459					}
460					else if (c == '}')
461					{
462						if (fp->indent && (b || *(cp - 2) != 'f'))
463						{
464							if (b)
465							{
466								fp->indent -= 2*INDENT;
467								fp->endbuf += 2*INDENT;
468							}
469							else
470							{
471								cp--;
472								c = 0;
473							}
474							goto flush;
475						}
476						else
477							fp->section = 0;
478					}
479					else if (c == ' ' || c == '\t')
480						continue;
481					else
482						fp->section = 0;
483				}
484				else if (c == '?' && (cp >= lp || *cp != '?'))
485				{
486					if (fp->retain)
487					{
488						cp--;
489						while (cp < lp && *cp != ' ' && *cp != '\t' && *cp != ']' && dp < &buf[sizeof(buf)-3])
490							*dp++ = *cp++;
491						if (cp < lp && (*cp == ' ' || *cp == '\t'))
492							*dp++ = *cp++;
493						*dp = 0;
494						split(fp, buf, 0);
495						dp = buf;
496						ep = 0;
497						fp->retain = 0;
498						if (fp->outp >= fp->endbuf)
499							outline(fp);
500						continue;
501					}
502				}
503				else if (c == ' ' || c == '\t')
504					for (c = ' '; *cp == ' ' || *cp == '\t'; cp++);
505			}
506			else if (c == '\b')
507			{
508				if (dp > buf)
509				{
510					dp--;
511					if (ep)
512						ep--;
513				}
514				continue;
515			}
516			else if (c == '\t')
517			{
518				/*
519				 * expand tabs
520				 */
521
522				if (!ep)
523					ep = dp;
524				c = isoption(fp, 'o') ? 1 : TABSZ - (dp - buf) % TABSZ;
525				if (dp >= &buf[sizeof(buf) - c - 3])
526				{
527					cp--;
528					break;
529				}
530				while (c-- > 0)
531					*dp++ = ' ';
532				continue;
533			}
534			else if (!isprint(c))
535				continue;
536			if (dp >= &buf[sizeof(buf) - 3])
537			{
538				tp = dp;
539				while (--tp > buf)
540					if (isspace(*tp))
541					{
542						cp -= dp - tp;
543						dp = tp;
544						break;
545					}
546				ep = 0;
547				break;
548			}
549			if (c != ' ')
550				ep = 0;
551			else if (!ep)
552				ep = dp;
553			*dp++ = c;
554		}
555		if (ep)
556			*ep = 0;
557		else
558			*dp = 0;
559		split(fp, buf, splice);
560	}
561	return 0;
562}
563
564int
565b_fmt(int argc, char** argv, Shbltin_t* context)
566{
567	register int	n;
568	char*		cp;
569	Fmt_t		fmt;
570	char		outbuf[8 * 1024];
571
572	fmt.flags = 0;
573	fmt.out = sfstdout;
574	fmt.outbuf = outbuf;
575	fmt.outp = 0;
576	fmt.endbuf = &outbuf[72];
577	fmt.indent = 0;
578	fmt.nextdent = 0;
579	fmt.nwords = 0;
580	fmt.prefix = 0;
581	fmt.quote = 0;
582	fmt.retain = 0;
583	fmt.section = 1;
584	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
585	for (;;)
586	{
587		switch (n = optget(argv, usage))
588		{
589		case 'c':
590		case 'o':
591		case 's':
592		case 'u':
593			setoption(&fmt, n);
594			continue;
595		case 'w':
596			if (opt_info.num < TABSZ || opt_info.num>= sizeof(outbuf))
597				error(2, "width out of range");
598			fmt.endbuf = &outbuf[opt_info.num];
599			continue;
600		case ':':
601			error(2, "%s", opt_info.arg);
602			break;
603		case '?':
604			error(ERROR_usage(2), "%s", opt_info.arg);
605			break;
606		}
607		break;
608	}
609	argv += opt_info.index;
610	if (error_info.errors)
611		error(ERROR_usage(2), "%s", optusage(NiL));
612	if (isoption(&fmt, 'o'))
613		setoption(&fmt, 'c');
614	if (isoption(&fmt, 's'))
615		clroption(&fmt, 'u');
616	if (cp = *argv)
617		argv++;
618	do {
619		if (!cp || streq(cp, "-"))
620			fmt.in = sfstdin;
621		else if (!(fmt.in = sfopen(NiL, cp, "r")))
622		{
623			error(ERROR_system(0), "%s: cannot open", cp);
624			error_info.errors = 1;
625			continue;
626		}
627		dofmt(&fmt);
628		if (fmt.in != sfstdin)
629			sfclose(fmt.in);
630	} while (cp = *argv++);
631	outline(&fmt);
632	if (sfsync(sfstdout))
633		error(ERROR_system(0), "write error");
634	return error_info.errors != 0;
635}
636