1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1992-2010 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                  Common Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*            http://www.opensource.org/licenses/cpl1.0.txt             *
11*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                                                                      *
20***********************************************************************/
21#pragma prototyped
22/*
23 * uniq
24 *
25 * Written by David Korn
26 */
27
/*
 * Option and usage description for uniq.  b_uniq() hands this string to
 * optget() when it parses the command line; the bracketed entries name
 * the options (c, d, D, f, i, s, u, w) that the switch in b_uniq()
 * handles.  USAGE_LICENSE is not defined in this file -- presumably the
 * build supplies it (TODO confirm).
 */
static const char usage[] =
"[-n?\n@(#)$Id: uniq (AT&T Research) 2009-11-28 $\n]"
USAGE_LICENSE
"[+NAME?uniq - Report or filter out repeated lines in a file]"
"[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and "
	"writes one copy of each input line on the output.  The second "
	"and succeeding copies of the repeated adjacent lines are not "
	"written.]"
"[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
	"to standard output.  If no \ainfile\a is given, or if the \ainfile\a "
	"is \b-\b, \buniq\b reads from standard input with the start of "
	"the file defined as the current offset.]"
"[c:count?Output the number of times each line occurred  along with "
	"the line.]"
"[d:repeated|duplicates?Output the first of each duplicate line.]"
"[D:all-repeated?Output all duplicate lines as a group with an empty "
    "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
    "{"
        "[n:none?Do not delimit duplicate groups.]"
        "[p:prepend?Prepend an empty line before each group.]"
        "[s:separate?Separate each group with an empty line.]"
    "}"
"[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
    "before checking for uniqueness. A field is the minimal string matching "
    "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to "
    "\b--skip-fields\b=\anumber\a.]"
"[i:ignore-case?Ignore case in comparisons.]"
"[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
	"before checking for uniqueness.  If specified along with \b-f\b, "
	"the first \achars\a after the first \afields\a are ignored.  If "
	"the \achars\a specifies more characters than are on the line, "
	"an empty string will be used for comparison. +\anumber\a is "
	"equivalent to \b--skip-chars\b=\anumber\a.]"
"[u:unique?Output unique lines.]"
"[w:check-chars]#[chars?\achars\a is the number of characters to compare "
	"after skipping any specified fields and characters.]"
"\n"
"\n[infile [outfile]]\n"
"\n"
"[+EXIT STATUS?]{"
	"[+0?The input file was successfully processed.]"
	"[+>0?An error occurred.]"
"}"
"[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
;
73
74#include <cmd.h>
75
/* bits of the mode mask that b_uniq() builds and uniq() tests */
enum
{
	C_FLAG = 1,	/* set by -c */
	D_FLAG = 2,	/* set by -d and -D */
	U_FLAG = 4	/* set by -u */
};

/* layout of the count that -c puts in front of each line */
enum
{
	CWIDTH = 4,	/* columns reserved for the number itself */
	MAXCNT = 9999	/* counts from here on no longer fit in CWIDTH columns */
};

/* key comparison: returns 0 when the first <size> bytes are considered equal */
typedef int (*Compare_f)(const char*, const char*, size_t);
84
85static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
86{
87	register int n, f, outsize=0, mb = mbwide();
88	register char *cp, *ep, *mp, *bufp, *outp;
89	char *orecp, *sbufp=0, *outbuff;
90	int reclen,oreclen= -1,count=0,cwidth=0,sep,next;
91	if(mode&C_FLAG)
92		cwidth = CWIDTH+1;
93	while(1)
94	{
95		if(bufp = sfgetr(fdin,'\n',0))
96			n = sfvalue(fdin);
97		else if(bufp = sfgetr(fdin,'\n',SF_LASTR))
98		{
99			n = sfvalue(fdin);
100			bufp = memcpy(fmtbuf(n + 1), bufp, n);
101			bufp[n++] = '\n';
102		}
103		else
104			n = 0;
105		if (n)
106		{
107			cp = bufp;
108			ep = cp + n;
109			if (f = fields)
110				while (f-->0 && cp<ep) /* skip over fields */
111				{
112					while (cp<ep && *cp==' ' || *cp=='\t')
113						cp++;
114					while (cp<ep && *cp!=' ' && *cp!='\t')
115						cp++;
116				}
117			if (chars)
118			{
119				if (mb)
120					for (f = chars; f; f--)
121						mbchar(cp);
122				else
123					cp += chars;
124			}
125			if ((reclen = n - (cp - bufp)) <= 0)
126			{
127				reclen = 1;
128				cp = bufp + n - 1;
129			}
130			else if (width >= 0 && width < reclen)
131			{
132				if (mb)
133				{
134					reclen = 0;
135					mp = cp;
136					while (reclen < width && mp < ep)
137					{
138						reclen++;
139						mbchar(mp);
140					}
141					reclen = mp - cp;
142				}
143				else
144					reclen = width;
145			}
146		}
147		else
148			reclen = -2;
149		if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen)))
150		{
151			count++;
152			if (!all)
153				continue;
154			next = count;
155		}
156		else
157		{
158			next = 0;
159			if(outsize>0)
160			{
161				if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count))
162				{
163					if(outp!=sbufp)
164						sfwrite(fdout,outp,0);
165				}
166				else
167				{
168					if(cwidth)
169					{
170						if(count<9)
171						{
172							f = 0;
173							while(f < CWIDTH-1)
174								outp[f++] = ' ';
175							outp[f++] = '0' + count + 1;
176							outp[f] = ' ';
177						}
178						else if(count<MAXCNT)
179						{
180							count++;
181							f = CWIDTH;
182							outp[f--] = ' ';
183							do
184							{
185								outp[f--] = '0' + (count % 10);
186							} while (count /= 10);
187							while (f >= 0)
188								outp[f--] = ' ';
189						}
190						else
191						{
192							outsize -= (CWIDTH+1);
193							if(outp!=sbufp)
194							{
195								if(!(sbufp=fmtbuf(outsize)))
196									return(1);
197								memcpy(sbufp,outp+CWIDTH+1,outsize);
198								sfwrite(fdout,outp,0);
199								outp = sbufp;
200							}
201							else
202								outp += CWIDTH+1;
203							sfprintf(fdout,"%4d ",count+1);
204						}
205					}
206					if(sfwrite(fdout,outp,outsize) != outsize)
207						return(1);
208				}
209			}
210		}
211		if(n==0)
212			break;
213		if(count = next)
214		{
215			if(sfwrite(fdout,outp,outsize) != outsize)
216				return(1);
217			if(*all >= 0)
218				*all = 1;
219			sep = 0;
220		}
221		else
222			sep = all && *all > 0;
223		/* save current record */
224		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
225			return(1);
226		outp = outbuff;
227		if(outsize < n+cwidth+sep)
228		{
229			/* no room in outp, clear lock and use side buffer */
230			sfwrite(fdout,outp,0);
231			if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep)))
232				return(1);
233		}
234		else
235			outsize = n+cwidth+sep;
236		memcpy(outp+cwidth+sep,bufp,n);
237		if(sep)
238			outp[cwidth] = '\n';
239		oreclen = reclen;
240		orecp = outp+cwidth+sep + (cp-bufp);
241	}
242	return(0);
243}
244
245int
246b_uniq(int argc, char** argv, void* context)
247{
248	register int n, mode=0;
249	register char *cp;
250	int fields=0, chars=0, width=-1;
251	Sfio_t *fpin, *fpout;
252	int* all = 0;
253	int sep;
254	Compare_f compare = (Compare_f)memcmp;
255
256	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
257	while (n = optget(argv, usage)) switch (n)
258	{
259	    case 'c':
260		mode |= C_FLAG;
261		break;
262	    case 'd':
263		mode |= D_FLAG;
264		break;
265	    case 'D':
266		mode |= D_FLAG;
267		switch ((int)opt_info.num)
268		{
269		case 'p':
270			sep = 1;
271			break;
272		case 's':
273			sep = 0;
274			break;
275		default:
276			sep = -1;
277			break;
278		}
279		all = &sep;
280		break;
281	    case 'i':
282		compare = (Compare_f)strncasecmp;
283		break;
284	    case 'u':
285		mode |= U_FLAG;
286		break;
287	    case 'f':
288		if(*opt_info.option=='-')
289			fields = opt_info.num;
290		else
291			chars = opt_info.num;
292		break;
293	    case 's':
294		chars = opt_info.num;
295		break;
296	    case 'w':
297		width = opt_info.num;
298		break;
299	    case ':':
300		error(2, "%s", opt_info.arg);
301		break;
302	    case '?':
303		error(ERROR_usage(2), "%s", opt_info.arg);
304		break;
305	}
306	argv += opt_info.index;
307	if(all && (mode&C_FLAG))
308		error(2, "-c and -D are mutually exclusive");
309	if(error_info.errors)
310		error(ERROR_usage(2), "%s", optusage(NiL));
311	if((cp = *argv) && (argv++,!streq(cp,"-")))
312	{
313		if(!(fpin = sfopen(NiL,cp,"r")))
314			error(ERROR_system(1),"%s: cannot open",cp);
315	}
316	else
317		fpin = sfstdin;
318	if(cp = *argv)
319	{
320		argv++;
321		if(!(fpout = sfopen(NiL,cp,"w")))
322			error(ERROR_system(1),"%s: cannot create",cp);
323	}
324	else
325		fpout = sfstdout;
326	if(*argv)
327	{
328		error(2, "too many arguments");
329		error(ERROR_usage(2), "%s", optusage(NiL));
330	}
331	error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
332	if(fpin!=sfstdin)
333		sfclose(fpin);
334	if(fpout!=sfstdout)
335		sfclose(fpout);
336	return(error_info.errors);
337}
338
339