join.c revision 1590
1/*-
2 * Copyright (c) 1991, 1993, 1994
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Steve Hayman of the Computer Science Department, Indiana University,
7 * Michiro Hikida and David Goodenough.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38#ifndef lint
39static char copyright[] =
40"@(#) Copyright (c) 1991, 1993, 1994\n\
41	The Regents of the University of California.  All rights reserved.\n";
42#endif /* not lint */
43
44#ifndef lint
45static char sccsid[] = "@(#)join.c	8.3 (Berkeley) 4/16/94";
46#endif /* not lint */
47
48#include <sys/param.h>
49
50#include <ctype.h>
51#include <err.h>
52#include <errno.h>
53#include <stdio.h>
54#include <stdlib.h>
55#include <string.h>
56
57/*
58 * There's a structure per input file which encapsulates the state of the
59 * file.  We repeatedly read lines from each file until we've read in all
60 * the consecutive lines from the file with a common join field.  Then we
61 * compare the set of lines with an equivalent set from the other file.
62 */
63typedef struct {
64	char *line;		/* line */
65	u_long linealloc;	/* line allocated count */
66	char **fields;		/* line field(s) */
67	u_long fieldcnt;	/* line field(s) count */
68	u_long fieldalloc;	/* line field(s) allocated count */
69} LINE;
70
71typedef struct {
72	FILE *fp;		/* file descriptor */
73	u_long joinf;		/* join field (-1, -2, -j) */
74	int unpair;		/* output unpairable lines (-a) */
75	int number;		/* 1 for file 1, 2 for file 2 */
76
77	LINE *set;		/* set of lines with same field */
78	int pushbool;		/* if pushback is set */
79	u_long pushback;	/* line on the stack */
80	u_long setcnt;		/* set count */
81	u_long setalloc;	/* set allocated count */
82} INPUT;
83INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, },
84      input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, };
85
86typedef struct {
87	u_long	filenum;	/* file number */
88	u_long	fieldno;	/* field number */
89} OLIST;
90OLIST *olist;			/* output field list */
91u_long olistcnt;		/* output field list count */
92u_long olistalloc;		/* output field allocated count */
93
94int joinout = 1;		/* show lines with matched join fields (-v) */
95int needsep;			/* need separator character */
96int spans = 1;			/* span multiple delimiters (-t) */
97char *empty;			/* empty field replacement string (-e) */
98char *tabchar = " \t";		/* delimiter characters (-t) */
99
100int  cmp __P((LINE *, u_long, LINE *, u_long));
101void fieldarg __P((char *));
102void joinlines __P((INPUT *, INPUT *));
103void obsolete __P((char **));
104void outfield __P((LINE *, u_long, int));
105void outoneline __P((INPUT *, LINE *));
106void outtwoline __P((INPUT *, LINE *, INPUT *, LINE *));
107void slurp __P((INPUT *));
108void usage __P((void));
109
110int
111main(argc, argv)
112	int argc;
113	char *argv[];
114{
115	INPUT *F1, *F2;
116	int aflag, ch, cval, vflag;
117	char *end;
118
119	F1 = &input1;
120	F2 = &input2;
121
122	aflag = vflag = 0;
123	obsolete(argv);
124	while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != EOF) {
125		switch (ch) {
126		case '\01':		/* See comment in obsolete(). */
127			aflag = 1;
128			F1->unpair = F2->unpair = 1;
129			break;
130		case '1':
131			if ((F1->joinf = strtol(optarg, &end, 10)) < 1)
132				errx(1, "-1 option field number less than 1");
133			if (*end)
134				errx(1, "illegal field number -- %s", optarg);
135			--F1->joinf;
136			break;
137		case '2':
138			if ((F2->joinf = strtol(optarg, &end, 10)) < 1)
139				errx(1, "-2 option field number less than 1");
140			if (*end)
141				errx(1, "illegal field number -- %s", optarg);
142			--F2->joinf;
143			break;
144		case 'a':
145			aflag = 1;
146			switch(strtol(optarg, &end, 10)) {
147			case 1:
148				F1->unpair = 1;
149				break;
150			case 2:
151				F2->unpair = 1;
152				break;
153			default:
154				errx(1, "-a option file number not 1 or 2");
155				break;
156			}
157			if (*end)
158				errx(1, "illegal file number -- %s", optarg);
159			break;
160		case 'e':
161			empty = optarg;
162			break;
163		case 'j':
164			if ((F1->joinf = F2->joinf =
165			    strtol(optarg, &end, 10)) < 1)
166				errx(1, "-j option field number less than 1");
167			if (*end)
168				errx(1, "illegal field number -- %s", optarg);
169			--F1->joinf;
170			--F2->joinf;
171			break;
172		case 'o':
173			fieldarg(optarg);
174			break;
175		case 't':
176			spans = 0;
177			if (strlen(tabchar = optarg) != 1)
178				errx(1, "illegal tab character specification");
179			break;
180		case 'v':
181			vflag = 1;
182			joinout = 0;
183			switch (strtol(optarg, &end, 10)) {
184			case 1:
185				F1->unpair = 1;
186				break;
187			case 2:
188				F2->unpair = 1;
189				break;
190			default:
191				errx(1, "-v option file number not 1 or 2");
192				break;
193			}
194			if (*end)
195				errx(1, "illegal file number -- %s", optarg);
196			break;
197		case '?':
198		default:
199			usage();
200		}
201	}
202	argc -= optind;
203	argv += optind;
204
205	if (aflag && vflag)
206		errx(1, "the -a and -v options are mutually exclusive");
207
208	if (argc != 2)
209		usage();
210
211	/* Open the files; "-" means stdin. */
212	if (!strcmp(*argv, "-"))
213		F1->fp = stdin;
214	else if ((F1->fp = fopen(*argv, "r")) == NULL)
215		err(1, "%s", *argv);
216	++argv;
217	if (!strcmp(*argv, "-"))
218		F2->fp = stdin;
219	else if ((F2->fp = fopen(*argv, "r")) == NULL)
220		err(1, "%s", *argv);
221	if (F1->fp == stdin && F2->fp == stdin)
222		errx(1, "only one input file may be stdin");
223
224	slurp(F1);
225	slurp(F2);
226	while (F1->setcnt && F2->setcnt) {
227		cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
228		if (cval == 0) {
229			/* Oh joy, oh rapture, oh beauty divine! */
230			if (joinout)
231				joinlines(F1, F2);
232			slurp(F1);
233			slurp(F2);
234		} else if (cval < 0) {
235			/* File 1 takes the lead... */
236			if (F1->unpair)
237				joinlines(F1, NULL);
238			slurp(F1);
239		} else {
240			/* File 2 takes the lead... */
241			if (F2->unpair)
242				joinlines(F2, NULL);
243			slurp(F2);
244		}
245	}
246
247	/*
248	 * Now that one of the files is used up, optionally output any
249	 * remaining lines from the other file.
250	 */
251	if (F1->unpair)
252		while (F1->setcnt) {
253			joinlines(F1, NULL);
254			slurp(F1);
255		}
256	if (F2->unpair)
257		while (F2->setcnt) {
258			joinlines(F2, NULL);
259			slurp(F2);
260		}
261	exit(0);
262}
263
264void
265slurp(F)
266	INPUT *F;
267{
268	LINE *lp, *lastlp, tmp;
269	size_t len;
270	int cnt;
271	char *bp, *fieldp;
272
273	/*
274	 * Read all of the lines from an input file that have the same
275	 * join field.
276	 */
277	F->setcnt = 0;
278	for (lastlp = NULL;; ++F->setcnt, lastlp = lp) {
279		/*
280		 * If we're out of space to hold line structures, allocate
281		 * more.  Initialize the structure so that we know that this
282		 * is new space.
283		 */
284		if (F->setcnt == F->setalloc) {
285			cnt = F->setalloc;
286			F->setalloc += 50;
287			if ((F->set = realloc(F->set,
288			    F->setalloc * sizeof(LINE))) == NULL)
289				err(1, NULL);
290			memset(F->set + cnt, 0, 50 * sizeof(LINE));
291		}
292
293		/*
294		 * Get any pushed back line, else get the next line.  Allocate
295		 * space as necessary.  If taking the line from the stack swap
296		 * the two structures so that we don't lose space allocated to
297		 * either structure.  This could be avoided by doing another
298		 * level of indirection, but it's probably okay as is.
299		 * but it's probably okay as is.
300		 */
301		lp = &F->set[F->setcnt];
302		if (F->pushbool) {
303			tmp = F->set[F->setcnt];
304			F->set[F->setcnt] = F->set[F->pushback];
305			F->set[F->pushback] = tmp;
306			F->pushbool = 0;
307			continue;
308		}
309		if ((bp = fgetln(F->fp, &len)) == NULL)
310			return;
311		if (lp->linealloc <= len + 1) {
312			lp->linealloc += MAX(100, len + 1);
313			if ((lp->line =
314			    realloc(lp->line, lp->linealloc)) == NULL)
315				err(1, NULL);
316		}
317		memmove(lp->line, bp, len);
318
319		/* Replace trailing newline, if it exists. */
320		if (bp[len - 1] == '\n')
321			lp->line[len - 1] = '\0';
322		else
323			lp->line[len] = '\0';
324		bp = lp->line;
325
326		/* Split the line into fields, allocate space as necessary. */
327		lp->fieldcnt = 0;
328		while ((fieldp = strsep(&bp, tabchar)) != NULL) {
329			if (spans && *fieldp == '\0')
330				continue;
331			if (lp->fieldcnt == lp->fieldalloc) {
332				lp->fieldalloc += 50;
333				if ((lp->fields = realloc(lp->fields,
334				    lp->fieldalloc * sizeof(char *))) == NULL)
335					err(1, NULL);
336			}
337			lp->fields[lp->fieldcnt++] = fieldp;
338		}
339
340		/* See if the join field value has changed. */
341		if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
342			F->pushbool = 1;
343			F->pushback = F->setcnt;
344			break;
345		}
346	}
347}
348
349int
350cmp(lp1, fieldno1, lp2, fieldno2)
351	LINE *lp1, *lp2;
352	u_long fieldno1, fieldno2;
353{
354	if (lp1->fieldcnt < fieldno1)
355		return (lp2->fieldcnt < fieldno2 ? 0 : 1);
356	if (lp2->fieldcnt < fieldno2)
357		return (-1);
358	return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2]));
359}
360
361void
362joinlines(F1, F2)
363	INPUT *F1, *F2;
364{
365	int cnt1, cnt2;
366
367	/*
368	 * Output the results of a join comparison.  The output may be from
369	 * either file 1 or file 2 (in which case the first argument is the
370	 * file from which to output) or from both.
371	 */
372	if (F2 == NULL) {
373		for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
374			outoneline(F1, &F1->set[cnt1]);
375		return;
376	}
377	for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
378		for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
379			outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
380}
381
382void
383outoneline(F, lp)
384	INPUT *F;
385	LINE *lp;
386{
387	int cnt;
388
389	/*
390	 * Output a single line from one of the files, according to the
391	 * join rules.  This happens when we are writing unmatched single
392	 * lines.  Output empty fields in the right places.
393	 */
394	if (olist)
395		for (cnt = 0; cnt < olistcnt; ++cnt) {
396			if (olist[cnt].filenum == F->number)
397				outfield(lp, olist[cnt].fieldno, 0);
398			else
399				outfield(lp, 0, 1);
400		}
401	else
402		for (cnt = 0; cnt < lp->fieldcnt; ++cnt)
403			outfield(lp, cnt, 0);
404	(void)printf("\n");
405	if (ferror(stdout))
406		err(1, "stdout");
407	needsep = 0;
408}
409
410void
411outtwoline(F1, lp1, F2, lp2)
412	INPUT *F1, *F2;
413	LINE *lp1, *lp2;
414{
415	int cnt;
416
417	/* Output a pair of lines according to the join list (if any). */
418	if (olist)
419		for (cnt = 0; cnt < olistcnt; ++cnt)
420			if (olist[cnt].filenum == 1)
421				outfield(lp1, olist[cnt].fieldno, 0);
422			else /* if (olist[cnt].filenum == 2) */
423				outfield(lp2, olist[cnt].fieldno, 0);
424	else {
425		/*
426		 * Output the join field, then the remaining fields from F1
427		 * and F2.
428		 */
429		outfield(lp1, F1->joinf, 0);
430		for (cnt = 0; cnt < lp1->fieldcnt; ++cnt)
431			if (F1->joinf != cnt)
432				outfield(lp1, cnt, 0);
433		for (cnt = 0; cnt < lp2->fieldcnt; ++cnt)
434			if (F2->joinf != cnt)
435				outfield(lp2, cnt, 0);
436	}
437	(void)printf("\n");
438	if (ferror(stdout))
439		err(1, "stdout");
440	needsep = 0;
441}
442
443void
444outfield(lp, fieldno, out_empty)
445	LINE *lp;
446	u_long fieldno;
447	int out_empty;
448{
449	if (needsep++)
450		(void)printf("%c", *tabchar);
451	if (!ferror(stdout))
452		if (lp->fieldcnt < fieldno || out_empty) {
453			if (empty != NULL)
454				(void)printf("%s", empty);
455		} else {
456			if (*lp->fields[fieldno] == '\0')
457				return;
458			(void)printf("%s", lp->fields[fieldno]);
459		}
460	if (ferror(stdout))
461		err(1, "stdout");
462}
463
464/*
465 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
466 * fields.
467 */
468void
469fieldarg(option)
470	char *option;
471{
472	u_long fieldno;
473	char *end, *token;
474
475	while ((token = strsep(&option, " \t")) != NULL) {
476		if (*token == '\0')
477			continue;
478		if (token[0] != '1' && token[0] != '2' || token[1] != '.')
479			errx(1, "malformed -o option field");
480		fieldno = strtol(token + 2, &end, 10);
481		if (*end)
482			errx(1, "malformed -o option field");
483		if (fieldno == 0)
484			errx(1, "field numbers are 1 based");
485		if (olistcnt == olistalloc) {
486			olistalloc += 50;
487			if ((olist = realloc(olist,
488			    olistalloc * sizeof(OLIST))) == NULL)
489				err(1, NULL);
490		}
491		olist[olistcnt].filenum = token[0] - '0';
492		olist[olistcnt].fieldno = fieldno - 1;
493		++olistcnt;
494	}
495}
496
497void
498obsolete(argv)
499	char **argv;
500{
501	int len;
502	char **p, *ap, *t;
503
504	while ((ap = *++argv) != NULL) {
505		/* Return if "--". */
506		if (ap[0] == '-' && ap[1] == '-')
507			return;
508		switch (ap[1]) {
509		case 'a':
510			/*
511			 * The original join allowed "-a", which meant the
512			 * same as -a1 plus -a2.  POSIX 1003.2, Draft 11.2
513			 * only specifies this as "-a 1" and "a -2", so we
514			 * have to use another option flag, one that is
515			 * unlikely to ever be used or accidentally entered
516			 * on the command line.  (Well, we could reallocate
517			 * the argv array, but that hardly seems worthwhile.)
518			 */
519			if (ap[2] == '\0')
520				ap[1] = '\01';
521			break;
522		case 'j':
523			/*
524			 * The original join allowed "-j[12] arg" and "-j arg".
525			 * Convert the former to "-[12] arg".  Don't convert
526			 * the latter since getopt(3) can handle it.
527			 */
528			switch(ap[2]) {
529			case '1':
530				if (ap[3] != '\0')
531					goto jbad;
532				ap[1] = '1';
533				ap[2] = '\0';
534				break;
535			case '2':
536				if (ap[3] != '\0')
537					goto jbad;
538				ap[1] = '2';
539				ap[2] = '\0';
540				break;
541			case '\0':
542				break;
543			default:
544jbad:				errx(1, "illegal option -- %s", ap);
545				usage();
546			}
547			break;
548		case 'o':
549			/*
550			 * The original join allowed "-o arg arg".
551			 * Convert to "-o arg -o arg".
552			 */
553			if (ap[2] != '\0')
554				break;
555			for (p = argv + 2; *p; ++p) {
556				if (p[0][0] != '1' &&
557				    p[0][0] != '2' || p[0][1] != '.')
558					break;
559				len = strlen(*p);
560				if (len - 2 != strspn(*p + 2, "0123456789"))
561					break;
562				if ((t = malloc(len + 3)) == NULL)
563					err(1, NULL);
564				t[0] = '-';
565				t[1] = 'o';
566				memmove(t + 2, *p, len + 1);
567				*p = t;
568			}
569			argv = p - 1;
570			break;
571		}
572	}
573}
574
575void
576usage()
577{
578	(void)fprintf(stderr, "%s%s\n",
579	    "usage: join [-a fileno | -v fileno ] [-e string] [-1 field] ",
580	    "[-2 field]\n            [-o list] [-t char] file1 file2");
581	exit(1);
582}
583