1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1992-2010 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                  Common Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*            http://www.opensource.org/licenses/cpl1.0.txt             *
11*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                                                                      *
20***********************************************************************/
21#pragma prototyped
22/*
23 * David Korn
24 * AT&T Bell Laboratories
25 *
 * cut fields or columns from each line of a file
27 */
28
29static const char usage[] =
30"[-?\n@(#)$Id: cut (AT&T Research) 2009-12-04 $\n]"
31USAGE_LICENSE
32"[+NAME?cut - cut out selected columns or fields of each line of a file]"
33"[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34	"from one or more files, contatenating them on standard output.]"
35"[+?The option argument \alist\a is a comma-separated or blank-separated "
36	"list of positive numbers and ranges.  Ranges can be of three "
37	"forms.  The first is two positive integers separated by a hyphen "
38	"(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39	"\ahigh\a.  The second is a positive number preceded by a hyphen "
40	"(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41	"\ahigh\a.  The last is a positive number followed by a hyphen "
42	"(\alow\a\b-\b), which represents all fields from \alow\a to the "
43	"last field, inclusive.  Elements in the \alist\a can be repeated, "
44	"can overlap, and can appear in any order.  The order of the "
45	"output is that of the input.]"
46"[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47"[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48        "cuts from standard input.   The start of the file is defined "
49        "as the current offset.]"
50"[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51"[c:characters]:[list?\bcut\b based on a list of character counts.]"
52"[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53	"to \adelim\a.  The default is the \btab\b character.]"
54"[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55	"character specified with the \b-d\b optiion.]"
56"[n!:split?Split multibyte characters selected by the \b-b\b option.]"
57"[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58	"records of length \areclen\a when used with the \b-b\b or \b-c\b "
59	"option.]"
60"[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61	"when used with the \b-f\b option.  By default, lines with no "
62	"delimiters will be passsed in untouched.]"
63"[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64	"the \b-f\b option is set to \aldelim\a.  The default is the "
65	"\bnewline\b character.]"
66"[N!:newline?Output new-lines at end of each record when used "
67	"with the \b-b\b or \b-c\b option.]"
68"\n"
69"\n[file ...]\n"
70"\n"
71"[+EXIT STATUS?]{"
72	"[+0?All files processed successfully.]"
73	"[+>0?One or more files failed to open or could not be read.]"
74"}"
75"[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
76;
77
78#include <cmd.h>
79#include <ctype.h>
80
/*
 * one delimiter (field or line) as selected on the command line
 */
typedef struct Delim_s
{
	char	*str;	/* option argument text; only meaningful once -d / -D was given */
	int	len;	/* size of the delimiter in bytes, 1 for a single byte delimiter */
	int	chr;	/* value of the delimiter character */
} Delim_t;
87
88typedef struct Cut_s
89{
90	int		mb;
91	int		eob;
92	int		cflag;
93	int		nosplit;
94	int		sflag;
95	int		nlflag;
96	int		reclen;
97	Delim_t		wdelim;
98	Delim_t		ldelim;
99	unsigned char	space[UCHAR_MAX+1];
100	int		list[2];	/* NOTE: must be last member */
101} Cut_t;
102
/* terminates Cut_t.list; also used as the "no upper bound" value */
#define HUGE		INT_MAX
/* size passed to sftmp(); parenthesized so it is safe inside expressions */
#define BLOCK		(8*1024)

/* mode bits collected by b_cut() from the command line options */
#define C_BYTES		1	/* -b */
#define C_CHARS		2	/* -c */
#define C_FIELDS	4	/* -f */
#define C_SUPRESS	8	/* -s */
#define C_NOSPLIT	16	/* -n */
#define C_NONEWLINE	32	/* -N */

/* byte classes stored in Cut_t.space[] (0 means ordinary byte) */
#define SP_LINE		1	/* line delimiter */
#define SP_WORD		2	/* field delimiter */
#define SP_WIDE		3	/* byte with the high bit set in a multibyte locale */

/* convert the multibyte character at p (at most n bytes) into w */
#define mb2wc(w,p,n)	(*ast.mb_towc)(&(w),(char*)(p),(n))
117
/*
 * qsort() comparison: order two int arrays by their first element only
 * returns -1, 0 or 1
 */

static int
mycomp(register const void* a, register const void* b)
{
	const int	lhs = *(const int*)a;
	const int	rhs = *(const int*)b;

	return (lhs > rhs) - (lhs < rhs);
}
131
132static Cut_t*
133cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
134{
135	register int*	lp;
136	register int	c;
137	register int	n = 0;
138	register int	range = 0;
139	register char*	cp = str;
140	Cut_t*		cut;
141
142	if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
143		error(ERROR_exit(1), "out of space");
144	if (cut->mb = mbwide())
145	{
146		memset(cut->space, 0, sizeof(cut->space) / 2);
147		memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
148	}
149	else
150		memset(cut->space, 0, sizeof(cut->space));
151	cut->wdelim = *wdelim;
152	if (wdelim->len == 1)
153		cut->space[wdelim->chr] = SP_WORD;
154	cut->ldelim = *ldelim;
155	cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
156	cut->space[cut->eob] = SP_LINE;
157	cut->cflag = (mode&C_CHARS) && cut->mb;
158	cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
159	cut->sflag = (mode&C_SUPRESS) != 0;
160	cut->nlflag = (mode&C_NONEWLINE) != 0;
161	cut->reclen = reclen;
162	lp = cut->list;
163	for (;;)
164		switch(c = *cp++)
165		{
166		case ' ':
167		case '\t':
168			while(*cp==' ' || *cp=='\t')
169				cp++;
170			/*FALLTHROUGH*/
171		case 0:
172		case ',':
173			if(range)
174			{
175				--range;
176				if((n = (n ? (n-range) : (HUGE-1))) < 0)
177					error(ERROR_exit(1),"invalid range for c/f option");
178				*lp++ = range;
179				*lp++ = n;
180			}
181			else
182			{
183				*lp++ = --n;
184				*lp++ = 1;
185			}
186			if(c==0)
187			{
188				register int *dp;
189				*lp = HUGE;
190				n = 1 + (lp-cut->list)/2;
191				qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
192				/* eliminate overlapping regions */
193				for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
194				{
195					if(lp[0] <= range)
196					{
197						if(lp[1]==HUGE)
198						{
199							dp[-1] = HUGE;
200							break;
201						}
202						if((c = lp[0]+lp[1]-range)>0)
203						{
204							range += c;
205							dp[-1] += c;
206						}
207					}
208					else
209					{
210						range = *dp++ = lp[0];
211						if(lp[1]==HUGE)
212						{
213							*dp++ = HUGE;
214							break;
215						}
216						range += (*dp++ = lp[1]);
217					}
218				}
219				*dp = HUGE;
220				lp = cut->list;
221				/* convert ranges into gaps */
222				for(n=0; *lp!=HUGE; lp+=2)
223				{
224					c = *lp;
225					*lp -= n;
226					n = c+lp[1];
227				}
228				return cut;
229			}
230			n = range = 0;
231			break;
232
233		case '-':
234			if(range)
235				error(ERROR_exit(1),"bad list for c/f option");
236			range = n?n:1;
237			n = 0;
238			break;
239
240		default:
241			if(!isdigit(c))
242				error(ERROR_exit(1),"bad list for c/f option");
243			n = 10*n + (c-'0');
244			break;
245		}
246	/* NOTREACHED */
247}
248
249/*
250 * cut each line of file <fdin> and put results to <fdout> using list <list>
251 */
252
253static void
254cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
255{
256	register int		c;
257	register int		len;
258	register int		ncol = 0;
259	register const int*	lp = cut->list;
260	register char*		bp;
261	register int		skip; /* non-zero for don't copy */
262	int			must;
263	char*			ep;
264	const char*		xx;
265
266	for (;;)
267	{
268		if (len = cut->reclen)
269			bp = sfreserve(fdin, len, -1);
270		else
271			bp = sfgetr(fdin, '\n', 0);
272		if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR)))
273			break;
274		len = sfvalue(fdin);
275		ep = bp + len;
276		xx = 0;
277		if (!(ncol = skip  = *(lp = cut->list)))
278			ncol = *++lp;
279		must = 1;
280		do
281		{
282			if (cut->nosplit)
283			{
284				register const char*	s = bp;
285				register int		w = len < ncol ? len : ncol;
286				register int		z;
287
288				while (w > 0)
289				{
290					if (!(*s & 0x80))
291						z = 1;
292					else if ((z = mblen(s, w)) <= 0)
293					{
294						if (s == bp && xx)
295						{
296							w += s - xx;
297							bp = (char*)(s = xx);
298							xx = 0;
299							continue;
300						}
301						xx = s;
302						if (skip)
303							s += w;
304						w = 0;
305						break;
306					}
307					s += z;
308					w -= z;
309				}
310				c = s - bp;
311				ncol = !w && ncol >= len;
312			}
313			else if (cut->cflag)
314			{
315				register const char*	s = bp;
316				register int		w = len;
317				register int		z;
318
319				while (w > 0 && ncol > 0)
320				{
321					ncol--;
322					if (!(*s & 0x80) || (z = mblen(s, w)) <= 0)
323						z = 1;
324					s += z;
325					w -= z;
326
327				}
328				c = s - bp;
329				ncol = !w && (ncol || !skip);
330			}
331			else
332			{
333				if ((c = ncol) > len)
334					c = len;
335				else if (c == len && !skip)
336					ncol++;
337				ncol -= c;
338			}
339			if (!skip && c)
340			{
341				if (sfwrite(fdout, (char*)bp, c) < 0)
342					return;
343				must = 0;
344			}
345			bp += c;
346			if (ncol)
347				break;
348			len -= c;
349			ncol = *++lp;
350			skip = !skip;
351		} while (ncol != HUGE);
352		if (!cut->nlflag && (skip || must || cut->reclen))
353		{
354			if (cut->ldelim.len > 1)
355				sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
356			else
357				sfputc(fdout, cut->ldelim.chr);
358		}
359	}
360}
361
362/*
363 * cut each line of file <fdin> and put results to <fdout> using list <list>
364 * stream <fdin> must be line buffered
365 */
366
367static void
368cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
369{
370	register unsigned char *sp = cut->space;
371	register unsigned char *cp;
372	register unsigned char *wp;
373	register int c, nfields;
374	register const int *lp = cut->list;
375	register unsigned char *copy;
376	register int nodelim, empty, inword=0;
377	register unsigned char *ep;
378	unsigned char *bp, *first;
379	int lastchar;
380	wchar_t w;
381	Sfio_t *fdtmp = 0;
382	long offset = 0;
383	unsigned char mb[8];
384	/* process each buffer */
385	while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
386	{
387		cp = bp;
388		ep = cp + --c;
389		if((lastchar = cp[c]) != cut->eob)
390			*ep = cut->eob;
391		/* process each line in the buffer */
392		while (cp <= ep)
393		{
394			first = cp;
395			if (!inword)
396			{
397				nodelim = empty = 1;
398				copy = cp;
399				if (nfields = *(lp = cut->list))
400					copy = 0;
401				else
402					nfields = *++lp;
403			}
404			else if (copy)
405				copy = cp;
406			inword = 0;
407			do
408			{
409				/* skip over non-delimiter characters */
410				if (cut->mb)
411					for (;;)
412					{
413						switch (c = sp[*(unsigned char*)cp++])
414						{
415						case 0:
416							continue;
417						case SP_WIDE:
418							wp = --cp;
419							while ((c = mb2wc(w, cp, ep - cp)) <= 0)
420							{
421								/* mb char possibly spanning buffer boundary -- fun stuff */
422								if ((ep - cp) < mbmax())
423								{
424									int	i;
425									int	j;
426									int	k;
427
428									if (lastchar != cut->eob)
429									{
430										*ep = lastchar;
431										if ((c = mb2wc(w, cp, ep - cp)) > 0)
432											break;
433									}
434									if (copy)
435									{
436										empty = 0;
437										if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
438											goto failed;
439									}
440									for (i = 0; i <= (ep - cp); i++)
441										mb[i] = cp[i];
442									if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
443										goto failed;
444									cp = bp;
445									ep = cp + --c;
446									if ((lastchar = cp[c]) != cut->eob)
447										*ep = cut->eob;
448									j = i;
449									k = 0;
450									while (j < mbmax())
451										mb[j++] = cp[k++];
452									if ((c = mb2wc(w, (char*)mb, j)) <= 0)
453									{
454										c = i;
455										w = 0;
456									}
457									first = bp = cp += c - i;
458									if (copy)
459									{
460										copy = bp;
461										if (w == cut->ldelim.chr)
462											lastchar = cut->ldelim.chr;
463										else if (w != cut->wdelim.chr)
464										{
465											empty = 0;
466											if (sfwrite(fdout, (char*)mb, c) < 0)
467												goto failed;
468										}
469									}
470									c = 0;
471								}
472								else
473								{
474									w = *cp;
475									c = 1;
476								}
477								break;
478							}
479							cp += c;
480							c = w;
481							if (c == cut->wdelim.chr)
482							{
483								c = SP_WORD;
484								break;
485							}
486							if (c == cut->ldelim.chr)
487							{
488								c = SP_LINE;
489								break;
490							}
491							continue;
492						default:
493							wp = cp - 1;
494							break;
495						}
496						break;
497					}
498				else
499				{
500					while (!(c = sp[*cp++]));
501					wp = cp - 1;
502				}
503				/* check for end-of-line */
504				if (c == SP_LINE)
505				{
506					if (cp <= ep)
507						break;
508					if (lastchar == cut->ldelim.chr)
509						break;
510					/* restore cut->last character */
511					if (lastchar != cut->eob)
512						*ep = lastchar;
513					inword++;
514					if (!sp[lastchar])
515						break;
516				}
517				nodelim = 0;
518				if (--nfields > 0)
519					continue;
520				nfields = *++lp;
521				if (copy)
522				{
523					empty = 0;
524					if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
525						goto failed;
526					copy = 0;
527				}
528				else
529					/* set to delimiter unless the first field */
530					copy = empty ? cp : wp;
531			} while (!inword);
532			if (!inword)
533			{
534				if (!copy)
535				{
536					if (nodelim)
537					{
538						if (!cut->sflag)
539						{
540							if (offset)
541							{
542								sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
543								sfmove(fdtmp,fdout,offset,-1);
544							}
545							copy = first;
546						}
547					}
548					else
549						sfputc(fdout,'\n');
550				}
551				if (offset)
552					sfseek(fdtmp,offset=0,SEEK_SET);
553			}
554			if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
555				goto failed;
556		}
557		/* see whether to save in tmp file */
558		if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
559		{
560			/* copy line to tmpfile in case no fields */
561			if(!fdtmp)
562				fdtmp = sftmp(BLOCK);
563			sfwrite(fdtmp,(char*)first,c);
564			offset +=c;
565		}
566	}
567 failed:
568	if(fdtmp)
569		sfclose(fdtmp);
570}
571
572int
573b_cut(int argc, char** argv, void* context)
574{
575	register char*		cp = 0;
576	register Sfio_t*	fp;
577	char*			s;
578	int			n;
579	Cut_t*			cut;
580	int			mode = 0;
581	Delim_t			wdelim;
582	Delim_t			ldelim;
583	size_t			reclen = 0;
584
585	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
586	wdelim.chr = '\t';
587	ldelim.chr = '\n';
588	wdelim.len = ldelim.len = 1;
589	for (;;)
590	{
591		switch (n = optget(argv, usage))
592		{
593		case 0:
594			break;
595		case 'b':
596		case 'c':
597			if(mode&C_FIELDS)
598			{
599				error(2, "f option already specified");
600				continue;
601			}
602			cp = opt_info.arg;
603			if(n=='b')
604				mode |= C_BYTES;
605			else
606				mode |= C_CHARS;
607			continue;
608		case 'D':
609			ldelim.str = opt_info.arg;
610			if (mbwide())
611			{
612				s = opt_info.arg;
613				ldelim.chr = mbchar(s);
614				if ((n = s - opt_info.arg) > 1)
615				{
616					ldelim.len = n;
617					continue;
618				}
619			}
620			ldelim.chr = *(unsigned char*)opt_info.arg;
621			ldelim.len = 1;
622			continue;
623		case 'd':
624			wdelim.str = opt_info.arg;
625			if (mbwide())
626			{
627				s = opt_info.arg;
628				wdelim.chr = mbchar(s);
629				if ((n = s - opt_info.arg) > 1)
630				{
631					wdelim.len = n;
632					continue;
633				}
634			}
635			wdelim.chr = *(unsigned char*)opt_info.arg;
636			wdelim.len = 1;
637			continue;
638		case 'f':
639			if(mode&(C_CHARS|C_BYTES))
640			{
641				error(2, "c option already specified");
642				continue;
643			}
644			cp = opt_info.arg;
645			mode |= C_FIELDS;
646			continue;
647		case 'n':
648			mode |= C_NOSPLIT;
649			continue;
650		case 'N':
651			mode |= C_NONEWLINE;
652			continue;
653		case 'R':
654		case 'r':
655			if(opt_info.num>0)
656				reclen = opt_info.num;
657			continue;
658		case 's':
659			mode |= C_SUPRESS;
660			continue;
661		case ':':
662			error(2, "%s", opt_info.arg);
663			break;
664		case '?':
665			error(ERROR_usage(2), "%s", opt_info.arg);
666			break;
667		}
668		break;
669	}
670	argv += opt_info.index;
671	if (error_info.errors)
672		error(ERROR_usage(2), "%s",optusage(NiL));
673	if(!cp)
674	{
675		error(2, "b, c or f option must be specified");
676		error(ERROR_usage(2), "%s", optusage(NiL));
677	}
678	if(!*cp)
679		error(3, "non-empty b, c or f option must be specified");
680	if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
681		error(3, "s option requires f option");
682	cut = cutinit(mode, cp, &wdelim, &ldelim, reclen);
683	if(cp = *argv)
684		argv++;
685	do
686	{
687		if(!cp || streq(cp,"-"))
688			fp = sfstdin;
689		else if(!(fp = sfopen(NiL,cp,"r")))
690		{
691			error(ERROR_system(0),"%s: cannot open",cp);
692			continue;
693		}
694		if(mode&C_FIELDS)
695			cutfields(cut,fp,sfstdout);
696		else
697			cutcols(cut,fp,sfstdout);
698		if(fp!=sfstdin)
699			sfclose(fp);
700	} while(cp = *argv++);
701	if (sfsync(sfstdout))
702		error(ERROR_system(0), "write error");
703	return error_info.errors != 0;
704}
705