1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1992-2011 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                  Common Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*            http://www.opensource.org/licenses/cpl1.0.txt             *
11*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                                                                      *
20***********************************************************************/
21#pragma prototyped
/*
 * David Korn
 * AT&T Bell Laboratories
 *
 * cut selected fields or columns from each line of a file
 */
28
29static const char usage[] =
30"[-?\n@(#)$Id: cut (AT&T Research) 2010-08-11 $\n]"
31USAGE_LICENSE
32"[+NAME?cut - cut out selected columns or fields of each line of a file]"
33"[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34	"from one or more files, contatenating them on standard output.]"
35"[+?The option argument \alist\a is a comma-separated or blank-separated "
36	"list of positive numbers and ranges.  Ranges can be of three "
37	"forms.  The first is two positive integers separated by a hyphen "
38	"(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39	"\ahigh\a.  The second is a positive number preceded by a hyphen "
40	"(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41	"\ahigh\a.  The last is a positive number followed by a hyphen "
42	"(\alow\a\b-\b), which represents all fields from \alow\a to the "
43	"last field, inclusive.  Elements in the \alist\a can be repeated, "
44	"can overlap, and can appear in any order.  The order of the "
45	"output is that of the input.]"
46"[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47"[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48        "cuts from standard input.   The start of the file is defined "
49        "as the current offset.]"
50"[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51"[c:characters]:[list?\bcut\b based on a list of character counts.]"
52"[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53	"to \adelim\a.  The default is the \btab\b character.]"
54"[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55	"character specified with the \b-d\b optiion.]"
56"[n!:split?Split multibyte characters selected by the \b-b\b option.]"
57"[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58	"records of length \areclen\a when used with the \b-b\b or \b-c\b "
59	"option.]"
60"[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61	"when used with the \b-f\b option.  By default, lines with no "
62	"delimiters will be passsed in untouched.]"
63"[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64	"the \b-f\b option is set to \aldelim\a.  The default is the "
65	"\bnewline\b character.]"
66"[N!:newline?Output new-lines at end of each record when used "
67	"with the \b-b\b or \b-c\b option.]"
68"\n"
69"\n[file ...]\n"
70"\n"
71"[+EXIT STATUS?]{"
72	"[+0?All files processed successfully.]"
73	"[+>0?One or more files failed to open or could not be read.]"
74"}"
75"[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
76;
77
78#include <cmd.h>
79#include <ctype.h>
80
/*
 * a field or line delimiter as taken from the command line
 */

typedef struct Delim_s
{
	char*		str;		/* delimiter text (the option argument)	*/
	int		len;		/* number of bytes of str that form it	*/
	int		chr;		/* delimiter character code		*/
} Delim_t;

/*
 * everything cutcols() and cutfields() need to process one stream;
 * built by cutinit()
 */

typedef struct Cut_s
{
	int		mb;		/* value of mbwide() at init time	*/
	int		eob;		/* 1-byte line delimiter, 0 otherwise	*/
	int		cflag;		/* -c given and mb is set		*/
	int		nosplit;	/* -b with C_NOSPLIT and mb is set	*/
	int		sflag;		/* C_SUPRESS was requested		*/
	int		nlflag;		/* C_NONEWLINE was requested		*/
	int		reclen;		/* fixed record length, 0 for lines	*/
	Delim_t		wdelim;		/* field delimiter			*/
	Delim_t		ldelim;		/* line delimiter			*/
	unsigned char	space[UCHAR_MAX+1];	/* SP_* class of each byte	*/
	int		list[2];	/* NOTE: must be last member */
} Cut_t;

#define HUGE		INT_MAX		/* terminates Cut_t.list	*/
#define BLOCK		(8*1024)	/* size handed to sftmp()	*/

/* mode bits collected from the command line options */
#define C_BYTES		1
#define C_CHARS		2
#define C_FIELDS	4
#define C_SUPRESS	8
#define C_NOSPLIT	16
#define C_NONEWLINE	32

/* non-zero byte classes stored in Cut_t.space */
#define SP_LINE		1		/* line delimiter		*/
#define SP_WORD		2		/* field delimiter		*/
#define SP_WIDE		3		/* byte with the high bit set	*/
115
/*
 * qsort() comparison: order two elements by the int each one starts with
 * returns -1, 0 or 1
 */

static int
mycomp(register const void* a, register const void* b)
{
	const int	lhs = *(const int*)a;
	const int	rhs = *(const int*)b;

	return (lhs > rhs) - (lhs < rhs);
}
129
130static Cut_t*
131cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
132{
133	register int*	lp;
134	register int	c;
135	register int	n = 0;
136	register int	range = 0;
137	register char*	cp = str;
138	Cut_t*		cut;
139
140	if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
141		error(ERROR_exit(1), "out of space");
142	if (cut->mb = mbwide())
143	{
144		memset(cut->space, 0, sizeof(cut->space) / 2);
145		memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
146	}
147	else
148		memset(cut->space, 0, sizeof(cut->space));
149	cut->wdelim = *wdelim;
150	if (wdelim->len == 1)
151		cut->space[wdelim->chr] = SP_WORD;
152	cut->ldelim = *ldelim;
153	cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
154	cut->space[cut->eob] = SP_LINE;
155	cut->cflag = (mode&C_CHARS) && cut->mb;
156	cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
157	cut->sflag = (mode&C_SUPRESS) != 0;
158	cut->nlflag = (mode&C_NONEWLINE) != 0;
159	cut->reclen = reclen;
160	lp = cut->list;
161	for (;;)
162		switch(c = *cp++)
163		{
164		case ' ':
165		case '\t':
166			while(*cp==' ' || *cp=='\t')
167				cp++;
168			/*FALLTHROUGH*/
169		case 0:
170		case ',':
171			if(range)
172			{
173				--range;
174				if((n = (n ? (n-range) : (HUGE-1))) < 0)
175					error(ERROR_exit(1),"invalid range for c/f option");
176				*lp++ = range;
177				*lp++ = n;
178			}
179			else
180			{
181				*lp++ = --n;
182				*lp++ = 1;
183			}
184			if(c==0)
185			{
186				register int *dp;
187				*lp = HUGE;
188				n = 1 + (lp-cut->list)/2;
189				qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
190				/* eliminate overlapping regions */
191				for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
192				{
193					if(lp[0] <= range)
194					{
195						if(lp[1]==HUGE)
196						{
197							dp[-1] = HUGE;
198							break;
199						}
200						if((c = lp[0]+lp[1]-range)>0)
201						{
202							range += c;
203							dp[-1] += c;
204						}
205					}
206					else
207					{
208						range = *dp++ = lp[0];
209						if(lp[1]==HUGE)
210						{
211							*dp++ = HUGE;
212							break;
213						}
214						range += (*dp++ = lp[1]);
215					}
216				}
217				*dp = HUGE;
218				lp = cut->list;
219				/* convert ranges into gaps */
220				for(n=0; *lp!=HUGE; lp+=2)
221				{
222					c = *lp;
223					*lp -= n;
224					n = c+lp[1];
225				}
226				return cut;
227			}
228			n = range = 0;
229			break;
230
231		case '-':
232			if(range)
233				error(ERROR_exit(1),"bad list for c/f option");
234			range = n?n:1;
235			n = 0;
236			break;
237
238		default:
239			if(!isdigit(c))
240				error(ERROR_exit(1),"bad list for c/f option");
241			n = 10*n + (c-'0');
242			break;
243		}
244	/* NOTREACHED */
245}
246
247/*
248 * cut each line of file <fdin> and put results to <fdout> using list <list>
249 */
250
251static void
252cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
253{
254	register int		c;
255	register int		len;
256	register int		ncol = 0;
257	register const int*	lp = cut->list;
258	register char*		bp;
259	register int		skip; /* non-zero for don't copy */
260	int			must;
261	char*			ep;
262	const char*		xx;
263
264	for (;;)
265	{
266		if (len = cut->reclen)
267			bp = sfreserve(fdin, len, -1);
268		else
269			bp = sfgetr(fdin, '\n', 0);
270		if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR)))
271			break;
272		len = sfvalue(fdin);
273		ep = bp + len;
274		xx = 0;
275		if (!(ncol = skip  = *(lp = cut->list)))
276			ncol = *++lp;
277		must = 1;
278		do
279		{
280			if (cut->nosplit)
281			{
282				register const char*	s = bp;
283				register int		w = len < ncol ? len : ncol;
284				register int		z;
285
286				while (w > 0)
287				{
288					if (!(*s & 0x80))
289						z = 1;
290					else if ((z = mbnsize(s, w)) <= 0)
291					{
292						if (s == bp && xx)
293						{
294							w += s - xx;
295							bp = (char*)(s = xx);
296							xx = 0;
297							continue;
298						}
299						xx = s;
300						if (skip)
301							s += w;
302						w = 0;
303						break;
304					}
305					s += z;
306					w -= z;
307				}
308				c = s - bp;
309				ncol = !w && ncol >= len;
310			}
311			else if (cut->cflag)
312			{
313				register const char*	s = bp;
314				register int		w = len;
315				register int		z;
316
317				while (w > 0 && ncol > 0)
318				{
319					ncol--;
320					if (!(*s & 0x80) || (z = mbnsize(s, w)) <= 0)
321						z = 1;
322					s += z;
323					w -= z;
324
325				}
326				c = s - bp;
327				ncol = !w && (ncol || !skip);
328			}
329			else
330			{
331				if ((c = ncol) > len)
332					c = len;
333				else if (c == len && !skip)
334					ncol++;
335				ncol -= c;
336			}
337			if (!skip && c)
338			{
339				if (sfwrite(fdout, (char*)bp, c) < 0)
340					return;
341				must = 0;
342			}
343			bp += c;
344			if (ncol)
345				break;
346			len -= c;
347			ncol = *++lp;
348			skip = !skip;
349		} while (ncol != HUGE);
350		if (!cut->nlflag && (skip || must || cut->reclen))
351		{
352			if (cut->ldelim.len > 1)
353				sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
354			else
355				sfputc(fdout, cut->ldelim.chr);
356		}
357	}
358}
359
360/*
361 * cut each line of file <fdin> and put results to <fdout> using list <list>
362 * stream <fdin> must be line buffered
363 */
364
365static void
366cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
367{
368	register unsigned char *sp = cut->space;
369	register unsigned char *cp;
370	register unsigned char *wp;
371	register int c, nfields;
372	register const int *lp = cut->list;
373	register unsigned char *copy;
374	register int nodelim, empty, inword=0;
375	register unsigned char *ep;
376	unsigned char *bp, *first;
377	int lastchar;
378	wchar_t w;
379	Sfio_t *fdtmp = 0;
380	long offset = 0;
381	unsigned char mb[8];
382	/* process each buffer */
383	while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
384	{
385		cp = bp;
386		ep = cp + --c;
387		if((lastchar = cp[c]) != cut->eob)
388			*ep = cut->eob;
389		/* process each line in the buffer */
390		while (cp <= ep)
391		{
392			first = cp;
393			if (!inword)
394			{
395				nodelim = empty = 1;
396				copy = cp;
397				if (nfields = *(lp = cut->list))
398					copy = 0;
399				else
400					nfields = *++lp;
401			}
402			else if (copy)
403				copy = cp;
404			inword = 0;
405			do
406			{
407				/* skip over non-delimiter characters */
408				if (cut->mb)
409					for (;;)
410					{
411						switch (c = sp[*(unsigned char*)cp++])
412						{
413						case 0:
414							continue;
415						case SP_WIDE:
416							wp = --cp;
417							while ((c = mb2wc(w, cp, ep - cp)) <= 0)
418							{
419								/* mb char possibly spanning buffer boundary -- fun stuff */
420								if ((ep - cp) < mbmax())
421								{
422									int	i;
423									int	j;
424									int	k;
425
426									if (lastchar != cut->eob)
427									{
428										*ep = lastchar;
429										if ((c = mb2wc(w, cp, ep - cp)) > 0)
430											break;
431									}
432									if (copy)
433									{
434										empty = 0;
435										if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
436											goto failed;
437									}
438									for (i = 0; i <= (ep - cp); i++)
439										mb[i] = cp[i];
440									if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
441										goto failed;
442									cp = bp;
443									ep = cp + --c;
444									if ((lastchar = cp[c]) != cut->eob)
445										*ep = cut->eob;
446									j = i;
447									k = 0;
448									while (j < mbmax())
449										mb[j++] = cp[k++];
450									if ((c = mb2wc(w, (char*)mb, j)) <= 0)
451									{
452										c = i;
453										w = 0;
454									}
455									first = bp = cp += c - i;
456									if (copy)
457									{
458										copy = bp;
459										if (w == cut->ldelim.chr)
460											lastchar = cut->ldelim.chr;
461										else if (w != cut->wdelim.chr)
462										{
463											empty = 0;
464											if (sfwrite(fdout, (char*)mb, c) < 0)
465												goto failed;
466										}
467									}
468									c = 0;
469								}
470								else
471								{
472									w = *cp;
473									c = 1;
474								}
475								break;
476							}
477							cp += c;
478							c = w;
479							if (c == cut->wdelim.chr)
480							{
481								c = SP_WORD;
482								break;
483							}
484							if (c == cut->ldelim.chr)
485							{
486								c = SP_LINE;
487								break;
488							}
489							continue;
490						default:
491							wp = cp - 1;
492							break;
493						}
494						break;
495					}
496				else
497				{
498					while (!(c = sp[*cp++]));
499					wp = cp - 1;
500				}
501				/* check for end-of-line */
502				if (c == SP_LINE)
503				{
504					if (cp <= ep)
505						break;
506					if (lastchar == cut->ldelim.chr)
507						break;
508					/* restore cut->last character */
509					if (lastchar != cut->eob)
510						*ep = lastchar;
511					inword++;
512					if (!sp[lastchar])
513						break;
514				}
515				nodelim = 0;
516				if (--nfields > 0)
517					continue;
518				nfields = *++lp;
519				if (copy)
520				{
521					empty = 0;
522					if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
523						goto failed;
524					copy = 0;
525				}
526				else
527					/* set to delimiter unless the first field */
528					copy = empty ? cp : wp;
529			} while (!inword);
530			if (!inword)
531			{
532				if (!copy)
533				{
534					if (nodelim)
535					{
536						if (!cut->sflag)
537						{
538							if (offset)
539							{
540								sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
541								sfmove(fdtmp,fdout,offset,-1);
542							}
543							copy = first;
544						}
545					}
546					else
547						sfputc(fdout,'\n');
548				}
549				if (offset)
550					sfseek(fdtmp,offset=0,SEEK_SET);
551			}
552			if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
553				goto failed;
554		}
555		/* see whether to save in tmp file */
556		if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
557		{
558			/* copy line to tmpfile in case no fields */
559			if(!fdtmp)
560				fdtmp = sftmp(BLOCK);
561			sfwrite(fdtmp,(char*)first,c);
562			offset +=c;
563		}
564	}
565 failed:
566	if(fdtmp)
567		sfclose(fdtmp);
568}
569
570int
571b_cut(int argc, char** argv, void* context)
572{
573	register char*		cp = 0;
574	register Sfio_t*	fp;
575	char*			s;
576	int			n;
577	Cut_t*			cut;
578	int			mode = 0;
579	Delim_t			wdelim;
580	Delim_t			ldelim;
581	size_t			reclen = 0;
582
583	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
584	wdelim.chr = '\t';
585	ldelim.chr = '\n';
586	wdelim.len = ldelim.len = 1;
587	for (;;)
588	{
589		switch (optget(argv, usage))
590		{
591		case 0:
592			break;
593		case 'b':
594		case 'c':
595			if(mode&C_FIELDS)
596			{
597				error(2, "f option already specified");
598				continue;
599			}
600			cp = opt_info.arg;
601			if(opt_info.option[1]=='b')
602				mode |= C_BYTES;
603			else
604				mode |= C_CHARS;
605			continue;
606		case 'D':
607			ldelim.str = opt_info.arg;
608			if (mbwide())
609			{
610				s = opt_info.arg;
611				ldelim.chr = mbchar(s);
612				if ((n = s - opt_info.arg) > 1)
613				{
614					ldelim.len = n;
615					continue;
616				}
617			}
618			ldelim.chr = *(unsigned char*)opt_info.arg;
619			ldelim.len = 1;
620			continue;
621		case 'd':
622			wdelim.str = opt_info.arg;
623			if (mbwide())
624			{
625				s = opt_info.arg;
626				wdelim.chr = mbchar(s);
627				if ((n = s - opt_info.arg) > 1)
628				{
629					wdelim.len = n;
630					continue;
631				}
632			}
633			wdelim.chr = *(unsigned char*)opt_info.arg;
634			wdelim.len = 1;
635			continue;
636		case 'f':
637			if(mode&(C_CHARS|C_BYTES))
638			{
639				error(2, "c option already specified");
640				continue;
641			}
642			cp = opt_info.arg;
643			mode |= C_FIELDS;
644			continue;
645		case 'n':
646			mode |= C_NOSPLIT;
647			continue;
648		case 'N':
649			mode |= C_NONEWLINE;
650			continue;
651		case 'R':
652		case 'r':
653			if(opt_info.num>0)
654				reclen = opt_info.num;
655			continue;
656		case 's':
657			mode |= C_SUPRESS;
658			continue;
659		case ':':
660			error(2, "%s", opt_info.arg);
661			break;
662		case '?':
663			error(ERROR_usage(2), "%s", opt_info.arg);
664			break;
665		}
666		break;
667	}
668	argv += opt_info.index;
669	if (error_info.errors)
670		error(ERROR_usage(2), "%s",optusage(NiL));
671	if(!cp)
672	{
673		error(2, "b, c or f option must be specified");
674		error(ERROR_usage(2), "%s", optusage(NiL));
675	}
676	if(!*cp)
677		error(3, "non-empty b, c or f option must be specified");
678	if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
679		error(3, "s option requires f option");
680	cut = cutinit(mode, cp, &wdelim, &ldelim, reclen);
681	if(cp = *argv)
682		argv++;
683	do
684	{
685		if(!cp || streq(cp,"-"))
686			fp = sfstdin;
687		else if(!(fp = sfopen(NiL,cp,"r")))
688		{
689			error(ERROR_system(0),"%s: cannot open",cp);
690			continue;
691		}
692		if(mode&C_FIELDS)
693			cutfields(cut,fp,sfstdout);
694		else
695			cutcols(cut,fp,sfstdout);
696		if(fp!=sfstdin)
697			sfclose(fp);
698	} while(cp = *argv++);
699	if (sfsync(sfstdout))
700		error(ERROR_system(0), "write error");
701	return error_info.errors != 0;
702}
703