1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1992-2012 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                 Eclipse Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*          http://www.eclipse.org/org/documents/epl-v10.html           *
11*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                                                                      *
20***********************************************************************/
21#pragma prototyped
22/*
23 * David Korn
24 * AT&T Bell Laboratories
25 *
 * cut fields or columns from each line of a file
27 */
28
29static const char usage[] =
30"[-?\n@(#)$Id: cut (AT&T Research) 2010-08-11 $\n]"
31USAGE_LICENSE
32"[+NAME?cut - cut out selected columns or fields of each line of a file]"
33"[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34	"from one or more files, contatenating them on standard output.]"
35"[+?The option argument \alist\a is a comma-separated or blank-separated "
36	"list of positive numbers and ranges.  Ranges can be of three "
37	"forms.  The first is two positive integers separated by a hyphen "
38	"(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39	"\ahigh\a.  The second is a positive number preceded by a hyphen "
40	"(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41	"\ahigh\a.  The last is a positive number followed by a hyphen "
42	"(\alow\a\b-\b), which represents all fields from \alow\a to the "
43	"last field, inclusive.  Elements in the \alist\a can be repeated, "
44	"can overlap, and can appear in any order.  The order of the "
45	"output is that of the input.]"
46"[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47"[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48        "cuts from standard input.   The start of the file is defined "
49        "as the current offset.]"
50"[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51"[c:characters]:[list?\bcut\b based on a list of character counts.]"
52"[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53	"to \adelim\a.  The default is the \btab\b character.]"
54"[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55	"character specified with the \b-d\b optiion.]"
56"[n!:split?Split multibyte characters selected by the \b-b\b option.]"
57"[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58	"records of length \areclen\a when used with the \b-b\b or \b-c\b "
59	"option.]"
60"[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61	"when used with the \b-f\b option.  By default, lines with no "
62	"delimiters will be passsed in untouched.]"
63"[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64	"the \b-f\b option is set to \aldelim\a.  The default is the "
65	"\bnewline\b character.]"
66"[N!:newline?Output new-lines at end of each record when used "
67	"with the \b-b\b or \b-c\b option.]"
68"\n"
69"\n[file ...]\n"
70"\n"
71"[+EXIT STATUS?]{"
72	"[+0?All files processed successfully.]"
73	"[+>0?One or more files failed to open or could not be read.]"
74"}"
75"[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
76;
77
78#include <cmd.h>
79#include <ctype.h>
80
/*
 * A field or line delimiter as given on the command line.
 */
struct Delim_s
{
	char*		str;	/* delimiter text; only read when len > 1 */
	int		len;	/* size of the delimiter in bytes */
	int		chr;	/* delimiter as a character value */
};

typedef struct Delim_s Delim_t;
87
88typedef struct Cut_s
89{
90	int		mb;
91	int		eob;
92	int		cflag;
93	int		nosplit;
94	int		sflag;
95	int		nlflag;
96	int		reclen;
97	Delim_t		wdelim;
98	Delim_t		ldelim;
99	unsigned char	space[UCHAR_MAX+1];
100	int		list[2];	/* NOTE: must be last member */
101} Cut_t;
102
/* terminator of Cut_t.list; also the base for open ended range counts */
#define HUGE		INT_MAX
/* size handed to sftmp() by cutfields(); parenthesized so that it is */
/* safe inside larger expressions */
#define BLOCK		(8*1024)

/* mode bits collected by b_cut() and decoded by cutinit() */
#define C_BYTES		1
#define C_CHARS		2
#define C_FIELDS	4
#define C_SUPRESS	8
#define C_NOSPLIT	16
#define C_NONEWLINE	32

/* byte classes stored in Cut_t.space; 0 means no special meaning */
#define SP_LINE		1
#define SP_WORD		2
#define SP_WIDE		3
115
/*
 * qsort() comparison function: order two int arrays by their first
 * element only; returns -1, 0 or 1
 */

static int
mycomp(register const void* a, register const void* b)
{
	const int	left = *(const int*)a;
	const int	right = *(const int*)b;

	return (left > right) - (left < right);
}
129
130static Cut_t*
131cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
132{
133	register int*	lp;
134	register int	c;
135	register int	n = 0;
136	register int	range = 0;
137	register char*	cp = str;
138	Cut_t*		cut;
139
140	if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
141		error(ERROR_exit(1), "out of space");
142	if (cut->mb = mbwide())
143	{
144		memset(cut->space, 0, sizeof(cut->space) / 2);
145		memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
146	}
147	else
148		memset(cut->space, 0, sizeof(cut->space));
149	cut->wdelim = *wdelim;
150	if (wdelim->len == 1)
151		cut->space[wdelim->chr] = SP_WORD;
152	cut->ldelim = *ldelim;
153	cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
154	cut->space[cut->eob] = SP_LINE;
155	cut->cflag = (mode&C_CHARS) && cut->mb;
156	cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
157	cut->sflag = (mode&C_SUPRESS) != 0;
158	cut->nlflag = (mode&C_NONEWLINE) != 0;
159	cut->reclen = reclen;
160	lp = cut->list;
161	for (;;)
162		switch(c = *cp++)
163		{
164		case ' ':
165		case '\t':
166			while(*cp==' ' || *cp=='\t')
167				cp++;
168			/*FALLTHROUGH*/
169		case 0:
170		case ',':
171			if(range)
172			{
173				--range;
174				if((n = (n ? (n-range) : (HUGE-1))) < 0)
175					error(ERROR_exit(1),"invalid range for c/f option");
176				*lp++ = range;
177				*lp++ = n;
178			}
179			else
180			{
181				*lp++ = --n;
182				*lp++ = 1;
183			}
184			if(c==0)
185			{
186				register int *dp;
187				*lp = HUGE;
188				n = 1 + (lp-cut->list)/2;
189				qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
190				/* eliminate overlapping regions */
191				for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
192				{
193					if(lp[0] <= range)
194					{
195						if(lp[1]==HUGE)
196						{
197							dp[-1] = HUGE;
198							break;
199						}
200						if((c = lp[0]+lp[1]-range)>0)
201						{
202							range += c;
203							dp[-1] += c;
204						}
205					}
206					else
207					{
208						range = *dp++ = lp[0];
209						if(lp[1]==HUGE)
210						{
211							*dp++ = HUGE;
212							break;
213						}
214						range += (*dp++ = lp[1]);
215					}
216				}
217				*dp = HUGE;
218				lp = cut->list;
219				/* convert ranges into gaps */
220				for(n=0; *lp!=HUGE; lp+=2)
221				{
222					c = *lp;
223					*lp -= n;
224					n = c+lp[1];
225				}
226				return cut;
227			}
228			n = range = 0;
229			break;
230
231		case '-':
232			if(range)
233				error(ERROR_exit(1),"bad list for c/f option");
234			range = n?n:1;
235			n = 0;
236			break;
237
238		default:
239			if(!isdigit(c))
240				error(ERROR_exit(1),"bad list for c/f option");
241			n = 10*n + (c-'0');
242			break;
243		}
244	/* NOTREACHED */
245}
246
/*
 * cut each record of file <fdin> and put results to <fdout> using the
 * skip/copy counts in cut->list (the -b and -c case)
 *
 * Records are cut->reclen bytes long when that is non-zero, otherwise
 * they are read up to '\n'.  The function returns when no more records
 * can be read or as soon as a write to <fdout> fails.
 */

static void
cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
{
	register int		c;		/* bytes consumed by the current list entry */
	register int		len;		/* bytes left in the current record */
	register int		ncol = 0;	/* count taken from the current list entry */
	register const int*	lp = cut->list;
	register char*		bp;		/* current position in the record */
	register int		skip; /* non-zero for don't copy */
	int			must;		/* stays 1 until something is written for this record */
	const char*		xx;		/* set by the nosplit branch where mbnsize() failed */

	for (;;)
	{
		if (len = cut->reclen)
			bp = sfreserve(fdin, len, -1);
		else
			bp = sfgetr(fdin, '\n', 0);
		/* NOTE(review): the SF_LASTR call presumably fetches a final partial record -- confirm */
		if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR)))
			break;
		/* NOTE(review): sfvalue() is assumed to be the size of the record just read */
		len = sfvalue(fdin);
		xx = 0;
		/* the first list entry is a skip count; when it is 0 start out copying */
		if (!(ncol = skip  = *(lp = cut->list)))
			ncol = *++lp;
		must = 1;
		do
		{
			if (cut->nosplit)
			{
				/* byte counts, but stepping by whole characters using mbnsize() */
				register const char*	s = bp;
				register int		w = len < ncol ? len : ncol;
				register int		z;

				while (w > 0)
				{
					if (!(*s & 0x80))
						z = 1;
					else if ((z = mbnsize(s, w)) <= 0)
					{
						/* no complete character within the w bytes left for this entry */
						if (s == bp && xx)
						{
							/* retry from the position remembered in xx with a wider window */
							w += s - xx;
							bp = (char*)(s = xx);
							xx = 0;
							continue;
						}
						xx = s;
						if (skip)
							s += w;
						w = 0;
						break;
					}
					s += z;
					w -= z;
				}
				c = s - bp;
				/* non-zero ncol stops the entry loop below */
				ncol = !w && ncol >= len;
			}
			else if (cut->cflag)
			{
				/* character counts: each step consumes one character of z bytes */
				register const char*	s = bp;
				register int		w = len;
				register int		z;

				while (w > 0 && ncol > 0)
				{
					ncol--;
					/* a byte that mbnsize() rejects is counted as one character */
					if (!(*s & 0x80) || (z = mbnsize(s, w)) <= 0)
						z = 1;
					s += z;
					w -= z;

				}
				c = s - bp;
				/* non-zero ncol stops the entry loop below */
				ncol = !w && (ncol || !skip);
			}
			else
			{
				/* plain byte counts */
				if ((c = ncol) > len)
					c = len;
				else if (c == len && !skip)
					/* copy ends exactly at the end of the record: keep ncol non-zero to stop */
					ncol++;
				ncol -= c;
			}
			if (!skip && c)
			{
				if (sfwrite(fdout, (char*)bp, c) < 0)
					return;
				must = 0;
			}
			bp += c;
			/* ncol is non-zero here once the record is used up */
			if (ncol)
				break;
			len -= c;
			/* next list entry; entries alternate between skip and copy */
			ncol = *++lp;
			skip = !skip;
		} while (ncol != HUGE);
		/* terminate the output record unless C_NONEWLINE was requested */
		if (!cut->nlflag && (skip || must || cut->reclen))
		{
			if (cut->ldelim.len > 1)
				sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
			else
				sfputc(fdout, cut->ldelim.chr);
		}
	}
}
357
/*
 * cut each line of file <fdin> and put results to <fdout> using the
 * skip/copy field counts in cut->list (the -f case)
 * stream <fdin> must be line buffered
 *
 * Input is taken one buffer at a time with sfreserve(); a line may
 * continue from one buffer into the next, which is what <inword>,
 * <fdtmp> and <offset> keep track of.
 */

static void
cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
{
	register unsigned char *sp = cut->space;	/* byte class table */
	register unsigned char *cp;			/* scan position */
	register unsigned char *wp;			/* start of the delimiter just found */
	register int c, nfields;
	register const int *lp = cut->list;
	register unsigned char *copy;			/* start of pending output, 0 while skipping */
	register int nodelim, empty, inword=0;
	register unsigned char *ep;			/* last byte of the buffer */
	unsigned char *bp, *first;			/* buffer start, start of current line piece */
	int lastchar;					/* original value of *ep */
	wchar_t w;
	Sfio_t *fdtmp = 0;				/* holds line pieces that had no delimiter */
	long offset = 0;				/* number of bytes saved in fdtmp */
	unsigned char mb[8];				/* joins a character split between buffers */
	/* process each buffer */
	while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
	{
		cp = bp;
		ep = cp + --c;
		/* make the last byte a line delimiter so the scans below always stop */
		if((lastchar = cp[c]) != cut->eob)
			*ep = cut->eob;
		/* process each line in the buffer */
		while (cp <= ep)
		{
			first = cp;
			if (!inword)
			{
				/* a new line: restart the list; a zero first entry means copy at once */
				nodelim = empty = 1;
				copy = cp;
				if (nfields = *(lp = cut->list))
					copy = 0;
				else
					nfields = *++lp;
			}
			else if (copy)
				/* continuing a line from the previous buffer while copying */
				copy = cp;
			inword = 0;
			do
			{
				/* skip over non-delimiter characters */
				if (cut->mb)
					for (;;)
					{
						switch (c = sp[*(unsigned char*)cp++])
						{
						case 0:
							continue;
						case SP_WIDE:
							/* byte with the high bit set: decode the character */
							wp = --cp;
							while ((c = mb2wc(w, cp, ep - cp)) <= 0)
							{
								/* mb char possibly spanning buffer boundary -- fun stuff */
								if ((ep - cp) < mbmax())
								{
									int	i;
									int	j;
									int	k;

									/* try again with the real last byte restored */
									if (lastchar != cut->eob)
									{
										*ep = lastchar;
										if ((c = mb2wc(w, cp, ep - cp)) > 0)
											break;
									}
									/* flush what is pending before the buffer is replaced */
									if (copy)
									{
										empty = 0;
										if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
											goto failed;
									}
									/* save the tail of this buffer, then get the next one */
									for (i = 0; i <= (ep - cp); i++)
										mb[i] = cp[i];
									if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
										goto failed;
									cp = bp;
									ep = cp + --c;
									if ((lastchar = cp[c]) != cut->eob)
										*ep = cut->eob;
									/* complete the saved bytes from the new buffer and decode */
									j = i;
									k = 0;
									while (j < mbmax())
										mb[j++] = cp[k++];
									if ((c = mb2wc(w, (char*)mb, j)) <= 0)
									{
										c = i;
										w = 0;
									}
									/* c - i bytes of the character came from the new buffer */
									first = bp = cp += c - i;
									if (copy)
									{
										copy = bp;
										if (w == cut->ldelim.chr)
											lastchar = cut->ldelim.chr;
										else if (w != cut->wdelim.chr)
										{
											empty = 0;
											if (sfwrite(fdout, (char*)mb, c) < 0)
												goto failed;
										}
									}
									c = 0;
								}
								else
								{
									/* undecodable with enough bytes left: take one byte as is */
									w = *cp;
									c = 1;
								}
								break;
							}
							cp += c;
							c = w;
							if (c == cut->wdelim.chr)
							{
								c = SP_WORD;
								break;
							}
							if (c == cut->ldelim.chr)
							{
								c = SP_LINE;
								break;
							}
							continue;
						default:
							/* single byte delimiter */
							wp = cp - 1;
							break;
						}
						break;
					}
				else
				{
					while (!(c = sp[*cp++]));
					wp = cp - 1;
				}
				/* check for end-of-line */
				if (c == SP_LINE)
				{
					if (cp <= ep)
						break;
					if (lastchar == cut->ldelim.chr)
						break;
					/* restore cut->last character */
					if (lastchar != cut->eob)
						*ep = lastchar;
					/* the line continues in the next buffer */
					inword++;
					if (!sp[lastchar])
						break;
				}
				nodelim = 0;
				/* a field ended; stay on this list entry until its count is used up */
				if (--nfields > 0)
					continue;
				nfields = *++lp;
				if (copy)
				{
					/* end of a copy region: write it without the trailing delimiter */
					empty = 0;
					if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
						goto failed;
					copy = 0;
				}
				else
					/* set to delimiter unless the first field */
					copy = empty ? cp : wp;
			} while (!inword);
			if (!inword)
			{
				/* a complete line has been scanned */
				if (!copy)
				{
					if (nodelim)
					{
						if (!cut->sflag)
						{
							/* no delimiter: pass the line through, saved pieces first */
							if (offset)
							{
								sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
								sfmove(fdtmp,fdout,offset,-1);
							}
							copy = first;
						}
					}
					else
						/* NOTE(review): writes '\n' rather than cut->ldelim -- confirm intended */
						sfputc(fdout,'\n');
				}
				if (offset)
					sfseek(fdtmp,offset=0,SEEK_SET);
			}
			/* write what is still pending, unless -s applies to this line */
			if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
				goto failed;
		}
		/* see whether to save in tmp file */
		if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
		{
			/* copy line to tmpfile in case no fields */
			if(!fdtmp)
				fdtmp = sftmp(BLOCK);
			sfwrite(fdtmp,(char*)first,c);
			offset +=c;
		}
	}
 failed:
	/* also reached on normal end of input */
	if(fdtmp)
		sfclose(fdtmp);
}
567
568int
569b_cut(int argc, char** argv, Shbltin_t* context)
570{
571	register char*		cp = 0;
572	register Sfio_t*	fp;
573	char*			s;
574	int			n;
575	Cut_t*			cut;
576	int			mode = 0;
577	Delim_t			wdelim;
578	Delim_t			ldelim;
579	size_t			reclen = 0;
580
581	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
582	wdelim.chr = '\t';
583	ldelim.chr = '\n';
584	wdelim.len = ldelim.len = 1;
585	for (;;)
586	{
587		switch (optget(argv, usage))
588		{
589		case 0:
590			break;
591		case 'b':
592		case 'c':
593			if(mode&C_FIELDS)
594			{
595				error(2, "f option already specified");
596				continue;
597			}
598			cp = opt_info.arg;
599			if(opt_info.option[1]=='b')
600				mode |= C_BYTES;
601			else
602				mode |= C_CHARS;
603			continue;
604		case 'D':
605			ldelim.str = opt_info.arg;
606			if (mbwide())
607			{
608				s = opt_info.arg;
609				ldelim.chr = mbchar(s);
610				if ((n = s - opt_info.arg) > 1)
611				{
612					ldelim.len = n;
613					continue;
614				}
615			}
616			ldelim.chr = *(unsigned char*)opt_info.arg;
617			ldelim.len = 1;
618			continue;
619		case 'd':
620			wdelim.str = opt_info.arg;
621			if (mbwide())
622			{
623				s = opt_info.arg;
624				wdelim.chr = mbchar(s);
625				if ((n = s - opt_info.arg) > 1)
626				{
627					wdelim.len = n;
628					continue;
629				}
630			}
631			wdelim.chr = *(unsigned char*)opt_info.arg;
632			wdelim.len = 1;
633			continue;
634		case 'f':
635			if(mode&(C_CHARS|C_BYTES))
636			{
637				error(2, "c option already specified");
638				continue;
639			}
640			cp = opt_info.arg;
641			mode |= C_FIELDS;
642			continue;
643		case 'n':
644			mode |= C_NOSPLIT;
645			continue;
646		case 'N':
647			mode |= C_NONEWLINE;
648			continue;
649		case 'R':
650		case 'r':
651			if(opt_info.num>0)
652				reclen = opt_info.num;
653			continue;
654		case 's':
655			mode |= C_SUPRESS;
656			continue;
657		case ':':
658			error(2, "%s", opt_info.arg);
659			break;
660		case '?':
661			error(ERROR_usage(2), "%s", opt_info.arg);
662			break;
663		}
664		break;
665	}
666	argv += opt_info.index;
667	if (error_info.errors)
668		error(ERROR_usage(2), "%s",optusage(NiL));
669	if(!cp)
670	{
671		error(2, "b, c or f option must be specified");
672		error(ERROR_usage(2), "%s", optusage(NiL));
673	}
674	if(!*cp)
675		error(3, "non-empty b, c or f option must be specified");
676	if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
677		error(3, "s option requires f option");
678	cut = cutinit(mode, cp, &wdelim, &ldelim, reclen);
679	if(cp = *argv)
680		argv++;
681	do
682	{
683		if(!cp || streq(cp,"-"))
684			fp = sfstdin;
685		else if(!(fp = sfopen(NiL,cp,"r")))
686		{
687			error(ERROR_system(0),"%s: cannot open",cp);
688			continue;
689		}
690		if(mode&C_FIELDS)
691			cutfields(cut,fp,sfstdout);
692		else
693			cutcols(cut,fp,sfstdout);
694		if(fp!=sfstdin)
695			sfclose(fp);
696	} while(cp = *argv++);
697	if (sfsync(sfstdout))
698		error(ERROR_system(0), "write error");
699	return error_info.errors != 0;
700}
701