join.c revision 24360
1/*-
2 * Copyright (c) 1991, 1993, 1994
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Steve Hayman of the Computer Science Department, Indiana University,
7 * Michiro Hikida and David Goodenough.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38#ifndef lint
39static char copyright[] =
40"@(#) Copyright (c) 1991, 1993, 1994\n\
41	The Regents of the University of California.  All rights reserved.\n";
42#endif /* not lint */
43
44#ifndef lint
45static char sccsid[] = "@(#)join.c	8.6 (Berkeley) 5/4/95";
46#endif /* not lint */
47
48#include <sys/param.h>
49
50#include <ctype.h>
51#include <err.h>
52#include <errno.h>
53#include <stdio.h>
54#include <stdlib.h>
55#include <string.h>
56#include <unistd.h>
57
58/*
59 * There's a structure per input file which encapsulates the state of the
60 * file.  We repeatedly read lines from each file until we've read in all
61 * the consecutive lines from the file with a common join field.  Then we
62 * compare the set of lines with an equivalent set from the other file.
63 */
64typedef struct {
65	char *line;		/* line */
66	u_long linealloc;	/* line allocated count */
67	char **fields;		/* line field(s) */
68	u_long fieldcnt;	/* line field(s) count */
69	u_long fieldalloc;	/* line field(s) allocated count */
70} LINE;
71
72typedef struct {
73	FILE *fp;		/* file descriptor */
74	u_long joinf;		/* join field (-1, -2, -j) */
75	int unpair;		/* output unpairable lines (-a) */
76	int number;		/* 1 for file 1, 2 for file 2 */
77
78	LINE *set;		/* set of lines with same field */
79	int pushbool;		/* if pushback is set */
80	u_long pushback;	/* line on the stack */
81	u_long setcnt;		/* set count */
82	u_long setalloc;	/* set allocated count */
83} INPUT;
84INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, },
85      input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, };
86
87typedef struct {
88	u_long	filenum;	/* file number */
89	u_long	fieldno;	/* field number */
90} OLIST;
91OLIST *olist;			/* output field list */
92u_long olistcnt;		/* output field list count */
93u_long olistalloc;		/* output field allocated count */
94
95int joinout = 1;		/* show lines with matched join fields (-v) */
96int needsep;			/* need separator character */
97int spans = 1;			/* span multiple delimiters (-t) */
98char *empty;			/* empty field replacement string (-e) */
99char *tabchar = " \t";		/* delimiter characters (-t) */
100
101int  cmp __P((LINE *, u_long, LINE *, u_long));
102void fieldarg __P((char *));
103void joinlines __P((INPUT *, INPUT *));
104void obsolete __P((char **));
105void outfield __P((LINE *, u_long, int));
106void outoneline __P((INPUT *, LINE *));
107void outtwoline __P((INPUT *, LINE *, INPUT *, LINE *));
108void slurp __P((INPUT *));
109void usage __P((void));
110
111int
112main(argc, argv)
113	int argc;
114	char *argv[];
115{
116	INPUT *F1, *F2;
117	int aflag, ch, cval, vflag;
118	char *end;
119
120	F1 = &input1;
121	F2 = &input2;
122
123	aflag = vflag = 0;
124	obsolete(argv);
125	while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) {
126		switch (ch) {
127		case '\01':		/* See comment in obsolete(). */
128			aflag = 1;
129			F1->unpair = F2->unpair = 1;
130			break;
131		case '1':
132			if ((F1->joinf = strtol(optarg, &end, 10)) < 1)
133				errx(1, "-1 option field number less than 1");
134			if (*end)
135				errx(1, "illegal field number -- %s", optarg);
136			--F1->joinf;
137			break;
138		case '2':
139			if ((F2->joinf = strtol(optarg, &end, 10)) < 1)
140				errx(1, "-2 option field number less than 1");
141			if (*end)
142				errx(1, "illegal field number -- %s", optarg);
143			--F2->joinf;
144			break;
145		case 'a':
146			aflag = 1;
147			switch(strtol(optarg, &end, 10)) {
148			case 1:
149				F1->unpair = 1;
150				break;
151			case 2:
152				F2->unpair = 1;
153				break;
154			default:
155				errx(1, "-a option file number not 1 or 2");
156				break;
157			}
158			if (*end)
159				errx(1, "illegal file number -- %s", optarg);
160			break;
161		case 'e':
162			empty = optarg;
163			break;
164		case 'j':
165			if ((F1->joinf = F2->joinf =
166			    strtol(optarg, &end, 10)) < 1)
167				errx(1, "-j option field number less than 1");
168			if (*end)
169				errx(1, "illegal field number -- %s", optarg);
170			--F1->joinf;
171			--F2->joinf;
172			break;
173		case 'o':
174			fieldarg(optarg);
175			break;
176		case 't':
177			spans = 0;
178			if (strlen(tabchar = optarg) != 1)
179				errx(1, "illegal tab character specification");
180			break;
181		case 'v':
182			vflag = 1;
183			joinout = 0;
184			switch (strtol(optarg, &end, 10)) {
185			case 1:
186				F1->unpair = 1;
187				break;
188			case 2:
189				F2->unpair = 1;
190				break;
191			default:
192				errx(1, "-v option file number not 1 or 2");
193				break;
194			}
195			if (*end)
196				errx(1, "illegal file number -- %s", optarg);
197			break;
198		case '?':
199		default:
200			usage();
201		}
202	}
203	argc -= optind;
204	argv += optind;
205
206	if (aflag && vflag)
207		errx(1, "the -a and -v options are mutually exclusive");
208
209	if (argc != 2)
210		usage();
211
212	/* Open the files; "-" means stdin. */
213	if (!strcmp(*argv, "-"))
214		F1->fp = stdin;
215	else if ((F1->fp = fopen(*argv, "r")) == NULL)
216		err(1, "%s", *argv);
217	++argv;
218	if (!strcmp(*argv, "-"))
219		F2->fp = stdin;
220	else if ((F2->fp = fopen(*argv, "r")) == NULL)
221		err(1, "%s", *argv);
222	if (F1->fp == stdin && F2->fp == stdin)
223		errx(1, "only one input file may be stdin");
224
225	slurp(F1);
226	slurp(F2);
227	while (F1->setcnt && F2->setcnt) {
228		cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
229		if (cval == 0) {
230			/* Oh joy, oh rapture, oh beauty divine! */
231			if (joinout)
232				joinlines(F1, F2);
233			slurp(F1);
234			slurp(F2);
235		} else if (cval < 0) {
236			/* File 1 takes the lead... */
237			if (F1->unpair)
238				joinlines(F1, NULL);
239			slurp(F1);
240		} else {
241			/* File 2 takes the lead... */
242			if (F2->unpair)
243				joinlines(F2, NULL);
244			slurp(F2);
245		}
246	}
247
248	/*
249	 * Now that one of the files is used up, optionally output any
250	 * remaining lines from the other file.
251	 */
252	if (F1->unpair)
253		while (F1->setcnt) {
254			joinlines(F1, NULL);
255			slurp(F1);
256		}
257	if (F2->unpair)
258		while (F2->setcnt) {
259			joinlines(F2, NULL);
260			slurp(F2);
261		}
262	exit(0);
263}
264
265void
266slurp(F)
267	INPUT *F;
268{
269	LINE *lp, *lastlp, tmp;
270	size_t len;
271	int cnt;
272	char *bp, *fieldp;
273
274	/*
275	 * Read all of the lines from an input file that have the same
276	 * join field.
277	 */
278	F->setcnt = 0;
279	for (lastlp = NULL;; ++F->setcnt) {
280		/*
281		 * If we're out of space to hold line structures, allocate
282		 * more.  Initialize the structure so that we know that this
283		 * is new space.
284		 */
285		if (F->setcnt == F->setalloc) {
286			cnt = F->setalloc;
287			F->setalloc += 50;
288			if ((F->set = realloc(F->set,
289			    F->setalloc * sizeof(LINE))) == NULL)
290				err(1, NULL);
291			memset(F->set + cnt, 0, 50 * sizeof(LINE));
292
293			/* re-set lastlp in case it moved */
294			if (lastlp != NULL)
295				lastlp = &F->set[F->setcnt - 1];
296		}
297
298		/*
299		 * Get any pushed back line, else get the next line.  Allocate
300		 * space as necessary.  If taking the line from the stack swap
301		 * the two structures so that we don't lose space allocated to
302		 * either structure.  This could be avoided by doing another
303		 * level of indirection, but it's probably okay as is.
304		 */
305		lp = &F->set[F->setcnt];
306		if (F->setcnt)
307			lastlp = &F->set[F->setcnt - 1];
308		if (F->pushbool) {
309			tmp = F->set[F->setcnt];
310			F->set[F->setcnt] = F->set[F->pushback];
311			F->set[F->pushback] = tmp;
312			F->pushbool = 0;
313			continue;
314		}
315		if ((bp = fgetln(F->fp, &len)) == NULL)
316			return;
317		if (lp->linealloc <= len + 1) {
318			lp->linealloc += MAX(100, len + 1 - lp->linealloc);
319			if ((lp->line =
320			    realloc(lp->line, lp->linealloc)) == NULL)
321				err(1, NULL);
322		}
323		memmove(lp->line, bp, len);
324
325		/* Replace trailing newline, if it exists. */
326		if (bp[len - 1] == '\n')
327			lp->line[len - 1] = '\0';
328		else
329			lp->line[len] = '\0';
330		bp = lp->line;
331
332		/* Split the line into fields, allocate space as necessary. */
333		lp->fieldcnt = 0;
334		while ((fieldp = strsep(&bp, tabchar)) != NULL) {
335			if (spans && *fieldp == '\0')
336				continue;
337			if (lp->fieldcnt == lp->fieldalloc) {
338				lp->fieldalloc += 50;
339				if ((lp->fields = realloc(lp->fields,
340				    lp->fieldalloc * sizeof(char *))) == NULL)
341					err(1, NULL);
342			}
343			lp->fields[lp->fieldcnt++] = fieldp;
344		}
345
346		/* See if the join field value has changed. */
347		if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
348			F->pushbool = 1;
349			F->pushback = F->setcnt;
350			break;
351		}
352	}
353}
354
355int
356cmp(lp1, fieldno1, lp2, fieldno2)
357	LINE *lp1, *lp2;
358	u_long fieldno1, fieldno2;
359{
360	if (lp1->fieldcnt <= fieldno1)
361		return (lp2->fieldcnt <= fieldno2 ? 0 : 1);
362	if (lp2->fieldcnt <= fieldno2)
363		return (-1);
364	return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2]));
365}
366
367void
368joinlines(F1, F2)
369	INPUT *F1, *F2;
370{
371	int cnt1, cnt2;
372
373	/*
374	 * Output the results of a join comparison.  The output may be from
375	 * either file 1 or file 2 (in which case the first argument is the
376	 * file from which to output) or from both.
377	 */
378	if (F2 == NULL) {
379		for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
380			outoneline(F1, &F1->set[cnt1]);
381		return;
382	}
383	for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
384		for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
385			outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
386}
387
388void
389outoneline(F, lp)
390	INPUT *F;
391	LINE *lp;
392{
393	int cnt;
394
395	/*
396	 * Output a single line from one of the files, according to the
397	 * join rules.  This happens when we are writing unmatched single
398	 * lines.  Output empty fields in the right places.
399	 */
400	if (olist)
401		for (cnt = 0; cnt < olistcnt; ++cnt) {
402			if (olist[cnt].filenum == F->number)
403				outfield(lp, olist[cnt].fieldno, 0);
404			else
405				outfield(lp, 0, 1);
406		}
407	else
408		for (cnt = 0; cnt < lp->fieldcnt; ++cnt)
409			outfield(lp, cnt, 0);
410	(void)printf("\n");
411	if (ferror(stdout))
412		err(1, "stdout");
413	needsep = 0;
414}
415
416void
417outtwoline(F1, lp1, F2, lp2)
418	INPUT *F1, *F2;
419	LINE *lp1, *lp2;
420{
421	int cnt;
422
423	/* Output a pair of lines according to the join list (if any). */
424	if (olist)
425		for (cnt = 0; cnt < olistcnt; ++cnt)
426			if (olist[cnt].filenum == 1)
427				outfield(lp1, olist[cnt].fieldno, 0);
428			else /* if (olist[cnt].filenum == 2) */
429				outfield(lp2, olist[cnt].fieldno, 0);
430	else {
431		/*
432		 * Output the join field, then the remaining fields from F1
433		 * and F2.
434		 */
435		outfield(lp1, F1->joinf, 0);
436		for (cnt = 0; cnt < lp1->fieldcnt; ++cnt)
437			if (F1->joinf != cnt)
438				outfield(lp1, cnt, 0);
439		for (cnt = 0; cnt < lp2->fieldcnt; ++cnt)
440			if (F2->joinf != cnt)
441				outfield(lp2, cnt, 0);
442	}
443	(void)printf("\n");
444	if (ferror(stdout))
445		err(1, "stdout");
446	needsep = 0;
447}
448
449void
450outfield(lp, fieldno, out_empty)
451	LINE *lp;
452	u_long fieldno;
453	int out_empty;
454{
455	if (needsep++)
456		(void)printf("%c", *tabchar);
457	if (!ferror(stdout))
458		if (lp->fieldcnt <= fieldno || out_empty) {
459			if (empty != NULL)
460				(void)printf("%s", empty);
461		} else {
462			if (*lp->fields[fieldno] == '\0')
463				return;
464			(void)printf("%s", lp->fields[fieldno]);
465		}
466	if (ferror(stdout))
467		err(1, "stdout");
468}
469
470/*
471 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
472 * fields.
473 */
474void
475fieldarg(option)
476	char *option;
477{
478	u_long fieldno;
479	char *end, *token;
480
481	while ((token = strsep(&option, ", \t")) != NULL) {
482		if (*token == '\0')
483			continue;
484		if (token[0] != '1' && token[0] != '2' || token[1] != '.')
485			errx(1, "malformed -o option field");
486		fieldno = strtol(token + 2, &end, 10);
487		if (*end)
488			errx(1, "malformed -o option field");
489		if (fieldno == 0)
490			errx(1, "field numbers are 1 based");
491		if (olistcnt == olistalloc) {
492			olistalloc += 50;
493			if ((olist = realloc(olist,
494			    olistalloc * sizeof(OLIST))) == NULL)
495				err(1, NULL);
496		}
497		olist[olistcnt].filenum = token[0] - '0';
498		olist[olistcnt].fieldno = fieldno - 1;
499		++olistcnt;
500	}
501}
502
503void
504obsolete(argv)
505	char **argv;
506{
507	int len;
508	char **p, *ap, *t;
509
510	while ((ap = *++argv) != NULL) {
511		/* Return if "--". */
512		if (ap[0] == '-' && ap[1] == '-')
513			return;
514		switch (ap[1]) {
515		case 'a':
516			/*
517			 * The original join allowed "-a", which meant the
518			 * same as -a1 plus -a2.  POSIX 1003.2, Draft 11.2
519			 * only specifies this as "-a 1" and "a -2", so we
520			 * have to use another option flag, one that is
521			 * unlikely to ever be used or accidentally entered
522			 * on the command line.  (Well, we could reallocate
523			 * the argv array, but that hardly seems worthwhile.)
524			 */
525			if (ap[2] == '\0')
526				ap[1] = '\01';
527			break;
528		case 'j':
529			/*
530			 * The original join allowed "-j[12] arg" and "-j arg".
531			 * Convert the former to "-[12] arg".  Don't convert
532			 * the latter since getopt(3) can handle it.
533			 */
534			switch(ap[2]) {
535			case '1':
536				if (ap[3] != '\0')
537					goto jbad;
538				ap[1] = '1';
539				ap[2] = '\0';
540				break;
541			case '2':
542				if (ap[3] != '\0')
543					goto jbad;
544				ap[1] = '2';
545				ap[2] = '\0';
546				break;
547			case '\0':
548				break;
549			default:
550jbad:				errx(1, "illegal option -- %s", ap);
551				usage();
552			}
553			break;
554		case 'o':
555			/*
556			 * The original join allowed "-o arg arg".
557			 * Convert to "-o arg -o arg".
558			 */
559			if (ap[2] != '\0')
560				break;
561			for (p = argv + 2; *p; ++p) {
562				if (p[0][0] != '1' &&
563				    p[0][0] != '2' || p[0][1] != '.')
564					break;
565				len = strlen(*p);
566				if (len - 2 != strspn(*p + 2, "0123456789"))
567					break;
568				if ((t = malloc(len + 3)) == NULL)
569					err(1, NULL);
570				t[0] = '-';
571				t[1] = 'o';
572				memmove(t + 2, *p, len + 1);
573				*p = t;
574			}
575			argv = p - 1;
576			break;
577		}
578	}
579}
580
581void
582usage()
583{
584	(void)fprintf(stderr, "%s%s\n",
585	    "usage: join [-a fileno | -v fileno ] [-e string] [-1 field] ",
586	    "[-2 field]\n            [-o list] [-t char] file1 file2");
587	exit(1);
588}
589